feat: track resource replacements when claiming a prebuilt workspace (#17571)

Closes https://github.com/coder/internal/issues/369

We can't know a priori whether a replacement (i.e. drift of Terraform
state leading to a resource needing to be deleted/recreated) will take
place; we can only detect it at `plan` time, because the provider
decides whether a resource must be replaced and it cannot be inferred
through static analysis of the template.

**This is likely to be the most common gotcha with using prebuilds,
since it requires a slight template modification to use prebuilds
effectively**, so let's head this off before it's an issue for
customers.

Drift details will now be logged in the workspace build logs:


![image](https://github.com/user-attachments/assets/da1988b6-2cbe-4a79-a3c5-ea29891f3d6f)

Plus a notification will be sent to template admins when this situation
arises:


![image](https://github.com/user-attachments/assets/39d555b1-a262-4a3e-b529-03b9f23bf66a)

A new metric - `coderd_prebuilt_workspaces_resource_replacements_total`
- will also increment each time a workspace encounters replacements.

We only track _that_ a resource replacement occurred, not how many. Just
one is enough to ruin a prebuild, but we can't know a priori which
replacement would cause this.
For example, say we have 2 replacements: a `docker_container` and a
`null_resource`; we don't know which one might
cause an issue (or indeed if either would), so we just track the
replacement.

---------

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
This commit is contained in:
Danny Kopping
2025-05-14 14:52:22 +02:00
committed by GitHub
parent e75d1c1ce5
commit 6e967780c9
33 changed files with 2048 additions and 969 deletions

View File

@ -1165,6 +1165,6 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio
}
reconciler := prebuilds.NewStoreReconciler(api.Database, api.Pubsub, api.DeploymentValues.Prebuilds,
api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry)
api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry, api.NotificationsEnqueuer)
return reconciler, prebuilds.NewEnterpriseClaimer(api.Database)
}

View File

@ -147,7 +147,7 @@ func TestClaimPrebuild(t *testing.T) {
EntitlementsUpdateInterval: time.Second,
})
reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(spy)
api.AGPL.PrebuildsClaimer.Store(&claimer)

View File

@ -3,6 +3,7 @@ package prebuilds
import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
@ -16,50 +17,73 @@ import (
"github.com/coder/coder/v2/coderd/prebuilds"
)
// Names of the Prometheus metrics describing prebuilt workspaces. Every name is built from the shared
// namespace prefix so that the metric descriptors and the tests refer to a single definition.
const (
// namespace is the common prefix of all prebuilt workspace metric names.
namespace = "coderd_prebuilt_workspaces_"
MetricCreatedCount = namespace + "created_total"
MetricFailedCount = namespace + "failed_total"
MetricClaimedCount = namespace + "claimed_total"
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
)
var (
labels = []string{"template_name", "preset_name", "organization_name"}
createdPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_created_total",
MetricCreatedCount,
"Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
"template preset.",
labels,
nil,
)
failedPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_failed_total",
MetricFailedCount,
"Total number of prebuilt workspaces that failed to build.",
labels,
nil,
)
claimedPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_claimed_total",
MetricClaimedCount,
"Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
"with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
labels,
nil,
)
resourceReplacementsDesc = prometheus.NewDesc(
MetricResourceReplacementsCount,
"Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. "+
"In Terraform, drift on immutable attributes results in resource replacement. "+
"This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource "+
"would have been recreated when claiming, thus obviating the point of pre-provisioning. "+
"See https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement",
labels,
nil,
)
desiredPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_desired",
MetricDesiredGauge,
"Target number of prebuilt workspaces that should be available for each template preset.",
labels,
nil,
)
runningPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_running",
MetricRunningGauge,
"Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
"successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
labels,
nil,
)
eligiblePrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_eligible",
MetricEligibleGauge,
"Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
"have completed their build process with their agent reporting 'ready' status.",
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_metrics_last_updated",
MetricLastUpdatedGauge,
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
[]string{},
nil,
@ -77,6 +101,9 @@ type MetricsCollector struct {
snapshotter prebuilds.StateSnapshotter
latestState atomic.Pointer[metricsState]
replacementsCounter map[replacementKey]float64
replacementsCounterMu sync.Mutex
}
var _ prometheus.Collector = new(MetricsCollector)
@ -84,9 +111,10 @@ var _ prometheus.Collector = new(MetricsCollector)
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
log := logger.Named("prebuilds_metrics_collector")
return &MetricsCollector{
database: db,
logger: log,
snapshotter: snapshotter,
database: db,
logger: log,
snapshotter: snapshotter,
replacementsCounter: make(map[replacementKey]float64),
}
}
@ -94,6 +122,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- createdPrebuildsDesc
descCh <- failedPrebuildsDesc
descCh <- claimedPrebuildsDesc
descCh <- resourceReplacementsDesc
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
@ -117,6 +146,12 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
}
mc.replacementsCounterMu.Lock()
for key, val := range mc.replacementsCounter {
metricsCh <- prometheus.MustNewConstMetric(resourceReplacementsDesc, prometheus.CounterValue, val, key.templateName, key.presetName, key.orgName)
}
mc.replacementsCounterMu.Unlock()
for _, preset := range currentState.snapshot.Presets {
if !preset.UsingActiveVersion {
continue
@ -187,3 +222,24 @@ func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Durati
})
return nil
}
// replacementKey identifies one series of the resource replacements counter: the organization, template,
// and preset a claimed prebuilt workspace belongs to.
type replacementKey struct {
	orgName      string
	templateName string
	presetName   string
}

// String renders the key as "<org>:<template>:<preset>".
func (k replacementKey) String() string {
	// All operands are strings, so fmt.Sprint inserts no separating spaces.
	return fmt.Sprint(k.orgName, ":", k.templateName, ":", k.presetName)
}
// trackResourceReplacement increments the resource replacements counter for the given organization, template,
// and preset.
//
// Only the fact _that_ a replacement occurred is recorded, not how many resources were replaced. A single
// replacement is enough to ruin a prebuild, and we cannot know a priori which one would be responsible: given
// a replaced docker_container and a replaced null_resource, either (or neither) might cause an issue, so the
// occurrence itself is what gets counted.
func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, presetName string) {
	key := replacementKey{
		orgName:      orgName,
		templateName: templateName,
		presetName:   presetName,
	}

	// The counter map is also read while metrics are collected, so all access goes through the mutex.
	mc.replacementsCounterMu.Lock()
	defer mc.replacementsCounterMu.Unlock()
	mc.replacementsCounter[key]++
}

View File

@ -57,12 +57,12 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -74,12 +74,12 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -91,11 +91,11 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricFailedCount, ptr.To(1.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -107,12 +107,12 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(1.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(1.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{true},
@ -124,12 +124,12 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
{prebuilds.MetricFailedCount, ptr.To(0.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -141,11 +141,11 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
{prebuilds.MetricClaimedCount, ptr.To(1.0), true},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -157,9 +157,9 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{uuid.New()},
ownerIDs: []uuid.UUID{uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
@ -171,7 +171,7 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false},
{prebuilds.MetricDesiredGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{true},
eligible: []bool{false},
@ -183,8 +183,8 @@ func TestMetricsCollector(t *testing.T) {
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
},
templateDeleted: []bool{true},
eligible: []bool{false},
@ -220,7 +220,7 @@ func TestMetricsCollector(t *testing.T) {
})
clock := quartz.NewMock(t)
db, pubsub := dbtestutil.NewDB(t)
reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
ctx := testutil.Context(t, testutil.WaitLong)
createdUsers := []uuid.UUID{agplprebuilds.SystemUserID}
@ -242,7 +242,7 @@ func TestMetricsCollector(t *testing.T) {
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
workspace := setupTestDBWorkspace(
workspace, _ := setupTestDBWorkspace(
t, clock, db, pubsub,
transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID,
)

View File

@ -3,8 +3,10 @@ package prebuilds
import (
"context"
"database/sql"
"errors"
"fmt"
"math"
"strings"
"sync"
"sync/atomic"
"time"
@ -19,11 +21,13 @@ import (
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
"cdr.dev/slog"
@ -40,6 +44,7 @@ type StoreReconciler struct {
clock quartz.Clock
registerer prometheus.Registerer
metrics *MetricsCollector
notifEnq notifications.Enqueuer
cancelFn context.CancelCauseFunc
running atomic.Bool
@ -56,6 +61,7 @@ func NewStoreReconciler(store database.Store,
logger slog.Logger,
clock quartz.Clock,
registerer prometheus.Registerer,
notifEnq notifications.Enqueuer,
) *StoreReconciler {
reconciler := &StoreReconciler{
store: store,
@ -64,6 +70,7 @@ func NewStoreReconciler(store database.Store,
cfg: cfg,
clock: clock,
registerer: registerer,
notifEnq: notifEnq,
done: make(chan struct{}, 1),
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
}
@ -633,3 +640,124 @@ func (c *StoreReconciler) provision(
return nil
}
// ForceMetricsUpdate refreshes the cached metrics state immediately. The state is normally cached to keep
// load off the database; when no metrics collector is configured this is a no-op.
func (c *StoreReconciler) ForceMetricsUpdate(ctx context.Context) error {
	if c.metrics != nil {
		return c.metrics.UpdateState(ctx, 10*time.Second)
	}
	return nil
}
// TrackResourceReplacement records that the given build of the given workspace encountered resource
// replacements. Failures are logged rather than returned. The work runs under a system-restricted context
// and is abandoned after one minute.
func (c *StoreReconciler) TrackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) {
	// nolint:gocritic // Necessary to query all the required data.
	ctx = dbauthz.AsSystemRestricted(ctx)

	// Callers may invoke this in a fire-and-forget fashion, so bound how long we keep trying.
	boundedCtx, cancel := context.WithTimeout(ctx, time.Minute)
	defer cancel()

	err := c.trackResourceReplacement(boundedCtx, workspaceID, buildID, replacements)
	if err == nil {
		return
	}
	c.logger.Error(ctx, "failed to track resource replacement", slog.Error(err))
}
// trackResourceReplacement records that claiming the prebuilt workspace workspaceID caused the resource
// replacements reported for build buildID.
//
// It loads the workspace, the claiming build, the original prebuild (build number 1), the prebuild's preset,
// the claimant, the template version and the organization. It then increments the resource replacements
// metric (when a metrics collector is configured) and enqueues a notification for every user holding the
// template admin role.
//
// Any lookup failure aborts the operation and is returned wrapped. A failure to enqueue one notification does
// not stop delivery to the remaining template admins; all such failures are accumulated and returned joined.
// If no notification enqueuer is configured, a warning is logged and nil is returned.
//
// nolint:revive // Shut up it's fine.
func (c *StoreReconciler) trackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement) error {
	if err := ctx.Err(); err != nil {
		return err
	}

	workspace, err := c.store.GetWorkspaceByID(ctx, workspaceID)
	if err != nil {
		return xerrors.Errorf("fetch workspace %q: %w", workspaceID.String(), err)
	}

	build, err := c.store.GetWorkspaceBuildByID(ctx, buildID)
	if err != nil {
		return xerrors.Errorf("fetch workspace build %q: %w", buildID.String(), err)
	}

	// The first build will always be the prebuild.
	prebuild, err := c.store.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, database.GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams{
		WorkspaceID: workspaceID, BuildNumber: 1,
	})
	if err != nil {
		return xerrors.Errorf("fetch prebuild: %w", err)
	}

	// This should not be possible, but defend against it.
	if !prebuild.TemplateVersionPresetID.Valid || prebuild.TemplateVersionPresetID.UUID == uuid.Nil {
		return xerrors.Errorf("no preset used in prebuild for workspace %q", workspaceID.String())
	}

	prebuildPreset, err := c.store.GetPresetByID(ctx, prebuild.TemplateVersionPresetID.UUID)
	if err != nil {
		return xerrors.Errorf("fetch template preset for template version ID %q: %w", prebuild.TemplateVersionID.String(), err)
	}

	claimant, err := c.store.GetUserByID(ctx, workspace.OwnerID) // At this point, the workspace is owned by the new owner.
	if err != nil {
		return xerrors.Errorf("fetch claimant %q: %w", workspace.OwnerID.String(), err)
	}

	// Use the claiming build here (not prebuild) because both should be equivalent, and we might as well spot inconsistencies now.
	templateVersion, err := c.store.GetTemplateVersionByID(ctx, build.TemplateVersionID)
	if err != nil {
		return xerrors.Errorf("fetch template version %q: %w", build.TemplateVersionID.String(), err)
	}

	org, err := c.store.GetOrganizationByID(ctx, workspace.OrganizationID)
	if err != nil {
		return xerrors.Errorf("fetch org %q: %w", workspace.OrganizationID.String(), err)
	}

	// Track resource replacement in Prometheus metric.
	if c.metrics != nil {
		c.metrics.trackResourceReplacement(org.Name, workspace.TemplateName, prebuildPreset.Name)
	}

	// Send notification to template admins.
	if c.notifEnq == nil {
		c.logger.Warn(ctx, "notification enqueuer not set, cannot send resource replacement notification(s)")
		return nil
	}

	// Map each replaced resource to a comma-separated list of the attribute paths that forced its replacement.
	repls := make(map[string]string, len(replacements))
	for _, repl := range replacements {
		repls[repl.GetResource()] = strings.Join(repl.GetPaths(), ", ")
	}

	templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
		RbacRole: []string{codersdk.RoleTemplateAdmin},
	})
	if err != nil {
		return xerrors.Errorf("fetch template admins: %w", err)
	}

	var notifErr error
	for _, templateAdmin := range templateAdmins {
		if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.TemplateWorkspaceResourceReplaced,
			map[string]string{
				"org":                 org.Name,
				"workspace":           workspace.Name,
				"template":            workspace.TemplateName,
				"template_version":    templateVersion.Name,
				"preset":              prebuildPreset.Name,
				"workspace_build_num": fmt.Sprintf("%d", build.BuildNumber),
				"claimant":            claimant.Username,
			},
			map[string]any{
				"replacements": repls,
			}, "prebuilds_reconciler",
			// Associate this notification with all the related entities.
			workspace.ID, workspace.OwnerID, workspace.TemplateID, templateVersion.ID, prebuildPreset.ID, workspace.OrganizationID,
		); err != nil {
			// Accumulate onto the errors gathered so far; joining only the new error would silently
			// discard every earlier delivery failure.
			notifErr = errors.Join(notifErr, xerrors.Errorf("send notification to %q: %w", templateAdmin.ID.String(), err))
		}
	}

	return notifErr
}

View File

@ -9,10 +9,14 @@ import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"golang.org/x/xerrors"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
"github.com/coder/coder/v2/coderd/util/slice"
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
@ -49,7 +53,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) {
ReconciliationInterval: serpent.Duration(testutil.WaitLong),
}
logger := testutil.Logger(t)
controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
// given a template version with no presets
org := dbgen.Organization(t, db, database.Organization{})
@ -94,7 +98,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) {
ReconciliationInterval: serpent.Duration(testutil.WaitLong),
}
logger := testutil.Logger(t)
controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
// given there are presets, but no prebuilds
org := dbgen.Organization(t, db, database.Organization{})
@ -345,7 +349,7 @@ func TestPrebuildReconciliation(t *testing.T) {
1,
uuid.New().String(),
)
prebuild := setupTestDBPrebuild(
prebuild, _ := setupTestDBPrebuild(
t,
clock,
db,
@ -367,7 +371,7 @@ func TestPrebuildReconciliation(t *testing.T) {
if useBrokenPubsub {
pubSub = &brokenPublisher{Pubsub: pubSub}
}
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
// Run the reconciliation multiple times to ensure idempotency
// 8 was arbitrary, but large enough to reasonably trust the result
@ -444,7 +448,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -477,7 +481,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
)
prebuildIDs := make([]uuid.UUID, 0)
for i := 0; i < int(preset.DesiredInstances.Int32); i++ {
prebuild := setupTestDBPrebuild(
prebuild, _ := setupTestDBPrebuild(
t,
clock,
db,
@ -528,7 +532,7 @@ func TestInvalidPreset(t *testing.T) {
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -592,7 +596,7 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -601,7 +605,7 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) {
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
prebuiltWorkspace := setupTestDBPrebuild(
prebuiltWorkspace, _ := setupTestDBPrebuild(
t,
clock,
db,
@ -669,7 +673,7 @@ func TestRunLoop(t *testing.T) {
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, pubSub := dbtestutil.NewDB(t)
reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry())
reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer())
ownerID := uuid.New()
dbgen.User(t, db, database.User{
@ -702,7 +706,7 @@ func TestRunLoop(t *testing.T) {
)
prebuildIDs := make([]uuid.UUID, 0)
for i := 0; i < int(preset.DesiredInstances.Int32); i++ {
prebuild := setupTestDBPrebuild(
prebuild, _ := setupTestDBPrebuild(
t,
clock,
db,
@ -799,7 +803,7 @@ func TestFailedBuildBackoff(t *testing.T) {
t, &slogtest.Options{IgnoreErrors: true},
).Leveled(slog.LevelDebug)
db, ps := dbtestutil.NewDB(t)
reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry())
reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry(), newNoopEnqueuer())
// Given: an active template version with presets and prebuilds configured.
const desiredInstances = 2
@ -812,7 +816,7 @@ func TestFailedBuildBackoff(t *testing.T) {
preset := setupTestDBPreset(t, db, templateVersionID, desiredInstances, "test")
for range desiredInstances {
_ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, database.ProvisionerJobStatusFailed, org.ID, preset, template.ID, templateVersionID)
_, _ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, database.ProvisionerJobStatusFailed, org.ID, preset, template.ID, templateVersionID)
}
// When: determining what actions to take next, backoff is calculated because the prebuild is in a failed state.
@ -873,7 +877,7 @@ func TestFailedBuildBackoff(t *testing.T) {
if i == 1 {
status = database.ProvisionerJobStatusSucceeded
}
_ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, status, org.ID, preset, template.ID, templateVersionID)
_, _ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, status, org.ID, preset, template.ID, templateVersionID)
}
// Then: the backoff time is roughly equal to two backoff intervals, since another build has failed.
@ -914,7 +918,8 @@ func TestReconciliationLock(t *testing.T) {
codersdk.PrebuildsConfig{},
slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug),
quartz.NewMock(t),
prometheus.NewRegistry())
prometheus.NewRegistry(),
newNoopEnqueuer())
reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error {
lockObtained := mutex.TryLock()
// As long as the postgres lock is held, this mutex should always be unlocked when we get here.
@ -931,6 +936,102 @@ func TestReconciliationLock(t *testing.T) {
wg.Wait()
}
// TestTrackResourceReplacement verifies that tracking a resource replacement for a claimed prebuilt workspace
// notifies template admins and increments the resource replacements metric.
func TestTrackResourceReplacement(t *testing.T) {
	t.Parallel()

	if !dbtestutil.WillUsePostgres() {
		t.Skip("This test requires postgres")
	}

	ctx := testutil.Context(t, testutil.WaitSuperLong)

	// Setup.
	clock := quartz.NewMock(t)
	logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: false}).Leveled(slog.LevelDebug)
	db, ps := dbtestutil.NewDB(t)

	fakeEnqueuer := newFakeEnqueuer()
	registry := prometheus.NewRegistry()
	reconciler := prebuilds.NewStoreReconciler(db, ps, codersdk.PrebuildsConfig{}, logger, clock, registry, fakeEnqueuer)

	// Given: a template admin to receive a notification.
	templateAdmin := dbgen.User(t, db, database.User{
		RBACRoles: []string{codersdk.RoleTemplateAdmin},
	})

	// Given: a prebuilt workspace.
	userID := uuid.New()
	dbgen.User(t, db, database.User{ID: userID})
	org, template := setupTestDBTemplate(t, db, userID, false)
	templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, userID, template.ID)
	preset := setupTestDBPreset(t, db, templateVersionID, 1, "b0rked")
	prebuiltWorkspace, prebuild := setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, database.ProvisionerJobStatusSucceeded, org.ID, preset, template.ID, templateVersionID)

	// The label names must match those the collector registers its descriptors with.
	wantLabels := map[string]string{
		"template_name":     template.Name,
		"preset_name":       preset.Name,
		"organization_name": org.Name,
	}

	// Given: no replacement has been tracked yet, we should not see a metric for it yet.
	require.NoError(t, reconciler.ForceMetricsUpdate(ctx))
	mf, err := registry.Gather()
	require.NoError(t, err)
	require.Nil(t, findMetric(mf, prebuilds.MetricResourceReplacementsCount, wantLabels))

	// When: a claim occurred and resource replacements are detected (_how_ is out of scope of this test).
	reconciler.TrackResourceReplacement(ctx, prebuiltWorkspace.ID, prebuild.ID, []*sdkproto.ResourceReplacement{
		{
			Resource: "docker_container[0]",
			Paths:    []string{"env", "image"},
		},
		{
			Resource: "docker_volume[0]",
			Paths:    []string{"name"},
		},
	})

	// Then: a notification will be sent detailing the replacement(s).
	matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
		// This is not an exhaustive check of the expected labels/data in the notification. This would tie the implementations
		// too tightly together.
		// All we need to validate is that a template of the right kind was sent, to the expected user, with some replacements.
		if !assert.Equal(t, notifications.TemplateWorkspaceResourceReplaced, notification.TemplateID, "unexpected template") {
			return false
		}

		if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
			return false
		}

		if !assert.Len(t, notification.Data["replacements"], 2, "unexpected replacements count") {
			return false
		}

		return true
	})
	require.Len(t, matching, 1)

	// Then: the metric will be incremented.
	mf, err = registry.Gather()
	require.NoError(t, err)
	metric := findMetric(mf, prebuilds.MetricResourceReplacementsCount, wantLabels)
	require.NotNil(t, metric)
	require.NotNil(t, metric.GetCounter())
	require.EqualValues(t, 1, metric.GetCounter().GetValue())
}
// newNoopEnqueuer returns a no-op notification enqueuer to hand to reconcilers under test.
func newNoopEnqueuer() *notifications.NoopEnqueuer {
	enqueuer := notifications.NewNoopEnqueuer()
	return enqueuer
}
// newFakeEnqueuer returns a fake notification enqueuer whose sent notifications can be inspected by tests.
func newFakeEnqueuer() *notificationstest.FakeEnqueuer {
	enqueuer := notificationstest.NewFakeEnqueuer()
	return enqueuer
}
// nolint:revive // It's a control flag, but this is a test.
func setupTestDBTemplate(
t *testing.T,
@ -1040,7 +1141,7 @@ func setupTestDBPrebuild(
preset database.TemplateVersionPreset,
templateID uuid.UUID,
templateVersionID uuid.UUID,
) database.WorkspaceTable {
) (database.WorkspaceTable, database.WorkspaceBuild) {
t.Helper()
return setupTestDBWorkspace(t, clock, db, ps, transition, prebuildStatus, orgID, preset, templateID, templateVersionID, agplprebuilds.SystemUserID, agplprebuilds.SystemUserID)
}
@ -1058,7 +1159,7 @@ func setupTestDBWorkspace(
templateVersionID uuid.UUID,
initiatorID uuid.UUID,
ownerID uuid.UUID,
) database.WorkspaceTable {
) (database.WorkspaceTable, database.WorkspaceBuild) {
t.Helper()
cancelledAt := sql.NullTime{}
completedAt := sql.NullTime{}
@ -1117,7 +1218,7 @@ func setupTestDBWorkspace(
},
})
return workspace
return workspace, workspaceBuild
}
// nolint:revive // It's a control flag, but this is a test.

View File

@ -19,6 +19,8 @@ import (
"storj.io/drpc/drpcserver"
"cdr.dev/slog"
"github.com/coder/websocket"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
@ -34,7 +36,6 @@ import (
"github.com/coder/coder/v2/codersdk/drpcsdk"
"github.com/coder/coder/v2/provisionerd/proto"
"github.com/coder/coder/v2/provisionersdk"
"github.com/coder/websocket"
)
func (api *API) provisionerDaemonsEnabledMW(next http.Handler) http.Handler {
@ -357,6 +358,7 @@ func (api *API) provisionerDaemonServe(rw http.ResponseWriter, r *http.Request)
Clock: api.Clock,
},
api.NotificationsEnqueuer,
&api.AGPL.PrebuildsReconciler,
)
if err != nil {
if !xerrors.Is(err, context.Canceled) {