diff --git a/coderd/database/lock.go b/coderd/database/lock.go index d8c76b18a4..a0d0a92be5 100644 --- a/coderd/database/lock.go +++ b/coderd/database/lock.go @@ -13,6 +13,7 @@ const ( LockIDNotificationsReportGenerator LockIDCryptoKeyRotation LockIDReconcileTemplatePrebuilds + LockIDDeterminePrebuildsState ) // GenLockID generates a unique and consistent lock ID from a given string. diff --git a/enterprise/coderd/prebuilds/controller.go b/enterprise/coderd/prebuilds/controller.go index bf3f95cc4f..acb3c61de4 100644 --- a/enterprise/coderd/prebuilds/controller.go +++ b/enterprise/coderd/prebuilds/controller.go @@ -6,18 +6,15 @@ import ( "database/sql" "encoding/base32" "fmt" - "math" - mrand "math/rand" "strings" "sync/atomic" "time" - "github.com/coder/coder/v2/coderd/util/slice" "github.com/hashicorp/go-multierror" - "golang.org/x/exp/slices" - "github.com/coder/coder/v2/coderd/audit" + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/database/provisionerjobs" "github.com/coder/coder/v2/coderd/database/pubsub" @@ -31,9 +28,6 @@ import ( "github.com/google/uuid" "golang.org/x/sync/errgroup" "golang.org/x/xerrors" - - "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/database/dbauthz" ) type Controller struct { @@ -134,39 +128,41 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) { return nil } - defer logger.Debug(ctx, "acquired top-level prebuilds reconciliation lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds()))) - - innerCtx, cancel := context.WithTimeout(ctx, time.Second*30) - defer cancel() + logger.Debug(ctx, "acquired top-level prebuilds reconciliation lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds()))) var id uuid.NullUUID if templateID != nil { id.UUID = *templateID + id.Valid = true } - presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, id) - if len(presetsWithPrebuilds) == 0 { - logger.Debug(innerCtx, "no templates found with prebuilds configured") + state, err := c.determineState(ctx, db, id) + if err != nil { + return xerrors.Errorf("determine current state: %w", err) + } + if len(state.presets) == 0 { + logger.Debug(ctx, "no templates found with prebuilds configured") return nil } - runningPrebuilds, err := db.GetRunningPrebuilds(ctx) - if err != nil { - return xerrors.Errorf("failed to get running prebuilds: %w", err) - } - - prebuildsInProgress, err := db.GetPrebuildsInProgress(ctx) - if err != nil { - return xerrors.Errorf("failed to get prebuilds in progress: %w", err) - } - // TODO: bounded concurrency? probably not but consider var eg errgroup.Group - for _, preset := range presetsWithPrebuilds { + for _, preset := range state.presets { + ps, err := state.filterByPreset(preset.PresetID) + if err != nil { + logger.Warn(ctx, "failed to find preset state", slog.Error(err), slog.F("preset_id", preset.PresetID.String())) + continue + } + + if !preset.UsingActiveVersion && len(ps.running) == 0 && len(ps.inProgress) == 0 { + logger.Debug(ctx, "skipping reconciliation for preset; inactive, no running prebuilds, and no in-progress operationss", + slog.F("preset_id", preset.PresetID.String())) + continue + } + eg.Go(func() error { // Pass outer context. - // TODO: name these better to avoid the comment. - err := c.reconcilePrebuildsForPreset(ctx, preset, runningPrebuilds, prebuildsInProgress) + err := c.reconcilePrebuildsForPreset(ctx, ps) if err != nil { logger.Error(ctx, "failed to reconcile prebuilds for preset", slog.Error(err), slog.F("preset_id", preset.PresetID)) } @@ -186,186 +182,64 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) { } } -type reconciliationActions struct { - deleteIDs []uuid.UUID - createIDs []uuid.UUID +// determineState determines the current state of prebuilds & the presets which define them. +// This function MUST be called within +func (c *Controller) determineState(ctx context.Context, store database.Store, id uuid.NullUUID) (*reconciliationState, error) { + if err := ctx.Err(); err != nil { + return nil, err + } - actual int32 // Running prebuilds for active version. - desired int32 // Active template version's desired instances as defined in preset. - eligible int32 // Prebuilds which can be claimed. - outdated int32 // Prebuilds which no longer match the active template version. - extraneous int32 // Extra running prebuilds for active version (somehow). - starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down. + var state reconciliationState + + err := store.InTx(func(db database.Store) error { + start := time.Now() + + // TODO: give up after some time waiting on this? + err := db.AcquireLock(ctx, database.LockIDDeterminePrebuildsState) + if err != nil { + return xerrors.Errorf("failed to acquire state determination lock: %w", err) + } + + c.logger.Debug(ctx, "acquired state determination lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds()))) + + presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, id) + if len(presetsWithPrebuilds) == 0 { + return nil + } + + allRunningPrebuilds, err := db.GetRunningPrebuilds(ctx) + if err != nil { + return xerrors.Errorf("failed to get running prebuilds: %w", err) + } + + allPrebuildsInProgress, err := db.GetPrebuildsInProgress(ctx) + if err != nil { + return xerrors.Errorf("failed to get prebuilds in progress: %w", err) + } + + state = newReconciliationState(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress) + return nil + }, &database.TxOptions{ + Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs + ReadOnly: true, + TxIdentifier: "prebuilds_state_determination", + }) + + return &state, err } -// calculateActions MUST be called within the context of a transaction (TODO: isolation) -// with an advisory lock to prevent TOCTOU races. -func (c *Controller) calculateActions(ctx context.Context, preset database.GetTemplatePresetsWithPrebuildsRow, running []database.GetRunningPrebuildsRow, inProgress []database.GetPrebuildsInProgressRow) (*reconciliationActions, error) { - // TODO: align workspace states with how we represent them on the FE and the CLI - // right now there's some slight differences which can lead to additional prebuilds being created - - // TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is - // about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races! - - var ( - actual int32 // Running prebuilds for active version. - desired int32 // Active template version's desired instances as defined in preset. - eligible int32 // Prebuilds which can be claimed. - outdated int32 // Prebuilds which no longer match the active template version. - extraneous int32 // Extra running prebuilds for active version (somehow). - starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down. - ) - - if preset.UsingActiveVersion { - actual = int32(len(running)) - desired = preset.DesiredInstances +func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, ps *presetState) error { + if ps == nil { + return xerrors.Errorf("unexpected nil preset state") } - for _, prebuild := range running { - if preset.UsingActiveVersion { - if prebuild.Eligible { - eligible++ - } - - extraneous = int32(math.Max(float64(actual-preset.DesiredInstances), 0)) - } - - if prebuild.TemplateVersionID == preset.TemplateVersionID && !preset.UsingActiveVersion { - outdated++ - } - } - - for _, progress := range inProgress { - switch progress.Transition { - case database.WorkspaceTransitionStart: - starting++ - case database.WorkspaceTransitionStop: - stopping++ - case database.WorkspaceTransitionDelete: - deleting++ - default: - c.logger.Warn(ctx, "unknown transition found in prebuilds in progress result", slog.F("transition", progress.Transition)) - } - } - - var ( - toCreate = int(math.Max(0, float64( - desired- // The number specified in the preset - (actual+starting)- // The current number of prebuilds (or builds in-flight) - stopping), // The number of prebuilds currently being stopped (should be 0) - )) - toDelete = int(math.Max(0, float64( - outdated- // The number of prebuilds running above the desired count for active version - deleting), // The number of prebuilds currently being deleted - )) - - actions = &reconciliationActions{ - actual: actual, - desired: desired, - eligible: eligible, - outdated: outdated, - extraneous: extraneous, - starting: starting, - stopping: stopping, - deleting: deleting, - } - ) - - // Bail early to avoid scheduling new prebuilds while operations are in progress. - if (toCreate+toDelete) > 0 && (starting+stopping+deleting) > 0 { - c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation", - slog.F("template_id", preset.TemplateID.String()), slog.F("starting", starting), - slog.F("stopping", stopping), slog.F("deleting", deleting), - slog.F("wanted_to_create", toCreate), slog.F("wanted_to_delete", toDelete)) - return actions, nil - } - - // It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so - // we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted. - if extraneous > 0 { - // Sort running IDs randomly so we can pick random victims. - slices.SortFunc(running, func(_, _ database.GetRunningPrebuildsRow) int { - if mrand.Float64() > 0.5 { - return -1 - } - - return 1 - }) - - var victims []uuid.UUID - for i := 0; i < int(extraneous); i++ { - if i >= len(running) { - // This should never happen. - c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!", - slog.F("running_count", len(running)), - slog.F("extraneous", extraneous)) - continue - } - - victims = append(victims, running[i].WorkspaceID) - } - - actions.deleteIDs = append(actions.deleteIDs, victims...) - - c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)", - slog.F("template_id", preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous), - slog.F("victims", victims)) - - // Prevent the rest of the reconciliation from completing - return actions, nil - } - - // If the template has become deleted or deprecated since the last reconciliation, we need to ensure we - // scale those prebuilds down to zero. - if preset.Deleted || preset.Deprecated { - toCreate = 0 - toDelete = int(actual + outdated) - } - - for i := 0; i < toCreate; i++ { - actions.createIDs = append(actions.createIDs, uuid.New()) - } - - if toDelete > 0 && len(running) != toDelete { - c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!", - slog.F("template_id", preset.TemplateID.String()), slog.F("running", len(running)), slog.F("to_delete", toDelete)) - } - - // TODO: implement lookup to not perform same action on workspace multiple times in $period - // i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion - for i := 0; i < toDelete; i++ { - if i >= len(running) { - // Above warning will have already addressed this. - continue - } - - actions.deleteIDs = append(actions.deleteIDs, running[i].WorkspaceID) - } - - return actions, nil -} - -func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset database.GetTemplatePresetsWithPrebuildsRow, - allRunning []database.GetRunningPrebuildsRow, allInProgress []database.GetPrebuildsInProgressRow, -) error { - logger := c.logger.With(slog.F("template_id", preset.TemplateID.String())) + logger := c.logger.With(slog.F("template_id", ps.preset.TemplateID.String())) var lastErr multierror.Error - vlogger := logger.With(slog.F("template_version_id", preset.TemplateVersionID), slog.F("preset_id", preset.PresetID)) + vlogger := logger.With(slog.F("template_version_id", ps.preset.TemplateVersionID), slog.F("preset_id", ps.preset.PresetID)) - running := slice.Filter(allRunning, func(prebuild database.GetRunningPrebuildsRow) bool { - if !prebuild.DesiredPresetID.Valid && !prebuild.CurrentPresetID.Valid { - return false - } - return prebuild.CurrentPresetID.UUID == preset.PresetID && - prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe. - }) - - inProgress := slice.Filter(allInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool { - return prebuild.TemplateVersionID == preset.TemplateVersionID - }) - - actions, err := c.calculateActions(ctx, preset, running, inProgress) + // TODO: move log lines up from calculateActions. + actions, err := ps.calculateActions() if err != nil { vlogger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err)) return xerrors.Errorf("failed to calculate reconciliation actions: %w", err) @@ -380,12 +254,12 @@ func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset dat }) levelFn := vlogger.Debug - if len(actions.createIDs) > 0 || len(actions.deleteIDs) > 0 { + if actions.create > 0 || len(actions.deleteIDs) > 0 { // Only log with info level when there's a change that needs to be effected. levelFn = vlogger.Info } levelFn(ctx, "template prebuild state retrieved", - slog.F("to_create", len(actions.createIDs)), slog.F("to_delete", len(actions.deleteIDs)), + slog.F("to_create", actions.create), slog.F("to_delete", len(actions.deleteIDs)), slog.F("desired", actions.desired), slog.F("actual", actions.actual), slog.F("outdated", actions.outdated), slog.F("extraneous", actions.extraneous), slog.F("starting", actions.starting), slog.F("stopping", actions.stopping), @@ -396,15 +270,15 @@ func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset dat // TODO: max per reconciliation iteration? // TODO: i've removed the surrounding tx, but if we restore it then we need to pass down the store to these funcs. - for _, id := range actions.createIDs { - if err := c.createPrebuild(ownerCtx, id, preset.TemplateID, preset.PresetID); err != nil { + for range actions.create { + if err := c.createPrebuild(ownerCtx, uuid.New(), ps.preset.TemplateID, ps.preset.PresetID); err != nil { vlogger.Error(ctx, "failed to create prebuild", slog.Error(err)) lastErr.Errors = append(lastErr.Errors, err) } } for _, id := range actions.deleteIDs { - if err := c.deletePrebuild(ownerCtx, id, preset.TemplateID, preset.PresetID); err != nil { + if err := c.deletePrebuild(ownerCtx, id, ps.preset.TemplateID, ps.preset.PresetID); err != nil { vlogger.Error(ctx, "failed to delete prebuild", slog.Error(err)) lastErr.Errors = append(lastErr.Errors, err) } diff --git a/enterprise/coderd/prebuilds/controller_test.go b/enterprise/coderd/prebuilds/controller_test.go new file mode 100644 index 0000000000..55e6eada05 --- /dev/null +++ b/enterprise/coderd/prebuilds/controller_test.go @@ -0,0 +1,130 @@ +package prebuilds + +import ( + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/database" +) + +var ( + templateID = uuid.New() + templateVersionID = uuid.New() + presetID = uuid.New() + preset2ID = uuid.New() + prebuildID = uuid.New() +) + +func TestReconciliationActions(t *testing.T) { + cases := map[string]struct { + preset database.GetTemplatePresetsWithPrebuildsRow // TODO: make own structs; reusing these types is lame + running []database.GetRunningPrebuildsRow + inProgress []database.GetPrebuildsInProgressRow + expected reconciliationActions + }{ + // New template version created which adds a new preset with prebuilds configured. + "CreateNetNew": { + preset: preset(true, 1), + expected: reconciliationActions{ + desired: 1, + create: 1, + }, + }, + // New template version created, making an existing preset and its prebuilds outdated. + "DeleteOutdated": { + preset: preset(false, 1), + running: []database.GetRunningPrebuildsRow{ + { + WorkspaceID: prebuildID, + TemplateID: templateID, + TemplateVersionID: templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, + DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, + Ready: true, + }, + }, + expected: reconciliationActions{ + outdated: 1, + deleteIDs: []uuid.UUID{prebuildID}, + }, + }, + // Somehow an additional prebuild is running, delete it. + // This can happen if an operator messes with a prebuild's state (stop, start). + "DeleteOldestExtraneous": { + preset: preset(true, 1), + running: []database.GetRunningPrebuildsRow{ + { + WorkspaceID: prebuildID, + TemplateID: templateID, + TemplateVersionID: templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, + DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, + CreatedAt: time.Now().Add(-time.Hour), + }, + { + WorkspaceID: uuid.New(), + TemplateID: templateID, + TemplateVersionID: templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, + DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, + CreatedAt: time.Now(), + }, + }, + expected: reconciliationActions{ + desired: 1, + extraneous: 1, + actual: 2, + deleteIDs: []uuid.UUID{prebuildID}, + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + t.Parallel() + + ps := presetState{ + preset: tc.preset, + running: tc.running, + inProgress: tc.inProgress, + } + + actions, err := ps.calculateActions() + require.NoError(t, err, "could not calculate reconciliation actions") + + validateActions(t, tc.expected, *actions) + }) + } +} + +func preset(active bool, instances int32) database.GetTemplatePresetsWithPrebuildsRow { + return database.GetTemplatePresetsWithPrebuildsRow{ + TemplateID: templateID, + TemplateVersionID: templateVersionID, + UsingActiveVersion: active, + PresetID: presetID, + Name: "bob", + DesiredInstances: instances, + Deleted: false, + Deprecated: false, + } +} + +// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for +// prebuilds align with zero values. +func validateActions(t *testing.T, expected, actual reconciliationActions) bool { + return assert.EqualValuesf(t, expected.deleteIDs, actual.deleteIDs, "'deleteIDs' did not match expectation") && + assert.EqualValuesf(t, expected.create, actual.create, "'create' did not match expectation") && + assert.EqualValuesf(t, expected.desired, actual.desired, "'desired' did not match expectation") && + assert.EqualValuesf(t, expected.actual, actual.actual, "'actual' did not match expectation") && + assert.EqualValuesf(t, expected.eligible, actual.eligible, "'eligible' did not match expectation") && + assert.EqualValuesf(t, expected.extraneous, actual.extraneous, "'extraneous' did not match expectation") && + assert.EqualValuesf(t, expected.outdated, actual.outdated, "'outdated' did not match expectation") && + assert.EqualValuesf(t, expected.starting, actual.starting, "'starting' did not match expectation") && + assert.EqualValuesf(t, expected.stopping, actual.stopping, "'stopping' did not match expectation") && + assert.EqualValuesf(t, expected.deleting, actual.deleting, "'deleting' did not match expectation") +} diff --git a/enterprise/coderd/prebuilds/reconciliation.go b/enterprise/coderd/prebuilds/reconciliation.go new file mode 100644 index 0000000000..2482f1d050 --- /dev/null +++ b/enterprise/coderd/prebuilds/reconciliation.go @@ -0,0 +1,219 @@ +package prebuilds + +import ( + "math" + "slices" + + "github.com/google/uuid" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/util/slice" +) + +type reconciliationState struct { + presets []database.GetTemplatePresetsWithPrebuildsRow + runningPrebuilds []database.GetRunningPrebuildsRow + prebuildsInProgress []database.GetPrebuildsInProgressRow +} + +type presetState struct { + preset database.GetTemplatePresetsWithPrebuildsRow + running []database.GetRunningPrebuildsRow + inProgress []database.GetPrebuildsInProgressRow +} + +type reconciliationActions struct { + actual int32 // Running prebuilds for active version. + desired int32 // Active template version's desired instances as defined in preset. + eligible int32 // Prebuilds which can be claimed. + outdated int32 // Prebuilds which no longer match the active template version. + extraneous int32 // Extra running prebuilds for active version (somehow). + starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down. + create int32 // The number of prebuilds required to be created to reconcile required state. + deleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state. +} + +func newReconciliationState(presets []database.GetTemplatePresetsWithPrebuildsRow, runningPrebuilds []database.GetRunningPrebuildsRow, prebuildsInProgress []database.GetPrebuildsInProgressRow) reconciliationState { + return reconciliationState{presets: presets, runningPrebuilds: runningPrebuilds, prebuildsInProgress: prebuildsInProgress} +} + +func (s reconciliationState) filterByPreset(presetID uuid.UUID) (*presetState, error) { + preset, found := slice.Find(s.presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool { + return preset.PresetID == presetID + }) + if !found { + return nil, xerrors.Errorf("no preset found with ID %q", presetID) + } + + running := slice.Filter(s.runningPrebuilds, func(prebuild database.GetRunningPrebuildsRow) bool { + if !prebuild.DesiredPresetID.Valid && !prebuild.CurrentPresetID.Valid { + return false + } + return prebuild.CurrentPresetID.UUID == preset.PresetID && + prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe. + }) + + // These aren't preset-specific, but they need to inhibit all presets of this template from operating since they could + // be in-progress builds which might impact another preset. For example, if a template goes from no defined prebuilds to defined prebuilds + // and back, or a template is updated from one version to another. + inProgress := slice.Filter(s.prebuildsInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool { + return prebuild.TemplateVersionID == preset.TemplateVersionID + }) + + return &presetState{ + preset: preset, + running: running, + inProgress: inProgress, + }, nil +} + +func (p presetState) calculateActions() (*reconciliationActions, error) { + // TODO: align workspace states with how we represent them on the FE and the CLI + // right now there's some slight differences which can lead to additional prebuilds being created + + // TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is + // about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races! + + var ( + actual int32 // Running prebuilds for active version. + desired int32 // Active template version's desired instances as defined in preset. + eligible int32 // Prebuilds which can be claimed. + outdated int32 // Prebuilds which no longer match the active template version. + extraneous int32 // Extra running prebuilds for active version (somehow). + starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down. + ) + + if p.preset.UsingActiveVersion { + actual = int32(len(p.running)) + desired = p.preset.DesiredInstances + } + + for _, prebuild := range p.running { + if p.preset.UsingActiveVersion { + if prebuild.Ready { + eligible++ + } + + extraneous = int32(math.Max(float64(actual-p.preset.DesiredInstances), 0)) + } + + if prebuild.TemplateVersionID == p.preset.TemplateVersionID && !p.preset.UsingActiveVersion { + outdated++ + } + } + + for _, progress := range p.inProgress { + switch progress.Transition { + case database.WorkspaceTransitionStart: + starting++ + case database.WorkspaceTransitionStop: + stopping++ + case database.WorkspaceTransitionDelete: + deleting++ + } + } + + var ( + toCreate = int(math.Max(0, float64( + desired- // The number specified in the preset + (actual+starting)- // The current number of prebuilds (or builds in-flight) + stopping), // The number of prebuilds currently being stopped (should be 0) + )) + toDelete = int(math.Max(0, float64( + outdated- // The number of prebuilds running above the desired count for active version + deleting), // The number of prebuilds currently being deleted + )) + + actions = &reconciliationActions{ + actual: actual, + desired: desired, + eligible: eligible, + outdated: outdated, + extraneous: extraneous, + starting: starting, + stopping: stopping, + deleting: deleting, + } + ) + + // Bail early to avoid scheduling new prebuilds while operations are in progress. + if (toCreate+toDelete) > 0 && (starting+stopping+deleting) > 0 { + // TODO: move up + //c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation", + // slog.F("template_id", p.preset.TemplateID.String()), slog.F("starting", starting), + // slog.F("stopping", stopping), slog.F("deleting", deleting), + // slog.F("wanted_to_create", create), slog.F("wanted_to_delete", toDelete)) + return actions, nil + } + + // It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so + // we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted. + if extraneous > 0 { + // Sort running IDs by creation time so we always delete the oldest prebuilds. + // In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better). + slices.SortFunc(p.running, func(a, b database.GetRunningPrebuildsRow) int { + if a.CreatedAt.Before(b.CreatedAt) { + return -1 + } + if a.CreatedAt.After(b.CreatedAt) { + return 1 + } + + return 0 + }) + + var victims []uuid.UUID + for i := 0; i < int(extraneous); i++ { + if i >= len(p.running) { + // This should never happen. + // TODO: move up + //c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!", + // slog.F("running_count", len(p.running)), + // slog.F("extraneous", extraneous)) + continue + } + + victims = append(victims, p.running[i].WorkspaceID) + } + + actions.deleteIDs = append(actions.deleteIDs, victims...) + + // TODO: move up + //c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)", + // slog.F("template_id", p.preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous), + // slog.F("victims", victims)) + + // Prevent the rest of the reconciliation from completing + return actions, nil + } + + // If the template has become deleted or deprecated since the last reconciliation, we need to ensure we + // scale those prebuilds down to zero. + if p.preset.Deleted || p.preset.Deprecated { + toCreate = 0 + toDelete = int(actual + outdated) + } + + actions.create = int32(toCreate) + + if toDelete > 0 && len(p.running) != toDelete { + // TODO: move up + //c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!", + // slog.F("template_id", s.preset.TemplateID.String()), slog.F("running", len(p.running)), slog.F("to_delete", toDelete)) + } + + // TODO: implement lookup to not perform same action on workspace multiple times in $period + // i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion + for i := 0; i < toDelete; i++ { + if i >= len(p.running) { + // TODO: move up + // Above warning will have already addressed this. + continue + } + + actions.deleteIDs = append(actions.deleteIDs, p.running[i].WorkspaceID) + } + + return actions, nil +}