diff --git a/enterprise/coderd/prebuilds/controller.go b/enterprise/coderd/prebuilds/controller.go index acb3c61de4..620227b2ba 100644 --- a/enterprise/coderd/prebuilds/controller.go +++ b/enterprise/coderd/prebuilds/controller.go @@ -93,6 +93,22 @@ func (c *Controller) ReconcileTemplate(templateID uuid.UUID) { c.nudgeCh <- &templateID } +// reconcile will attempt to resolve the desired vs actual state of all templates which have presets with prebuilds configured. +// +// NOTE: +// +// This function will kick of n provisioner jobs, based on the calculated state modifications. +// +// These provisioning jobs are fire-and-forget. We DO NOT wait for the prebuilt workspaces to complete their +// provisioning. As a consequence, it's possible that another reconciliation run will occur, which will mean that +// multiple preset versions could be reconciling at once. This may mean some temporary over-provisioning, but the +// reconciliation loop will bring these resources back into their desired numbers in an EVENTUALLY-consistent way. +// +// For example: we could decide to provision 1 new instance in this reconciliation. +// While that workspace is being provisioned, another template version is created which means this same preset will +// be reconciled again, leading to another workspace being provisioned. Two workspace builds will be occurring +// simultaneously for the same preset, but once both jobs have completed the reconciliation loop will notice the +// extraneous instance and delete it. func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) { var logger slog.Logger if templateID == nil { @@ -121,7 +137,7 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) { err := c.store.InTx(func(db database.Store) error { start := time.Now() - // TODO: give up after some time waiting on this? + // TODO: use TryAcquireLock here and bail out early. err := db.AcquireLock(ctx, database.LockIDReconcileTemplatePrebuilds) if err != nil { logger.Warn(ctx, "failed to acquire top-level prebuilds reconciliation lock; likely running on another coderd replica", slog.Error(err)) @@ -183,7 +199,7 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) { } // determineState determines the current state of prebuilds & the presets which define them. -// This function MUST be called within +// An application-level lock is used func (c *Controller) determineState(ctx context.Context, store database.Store, id uuid.NullUUID) (*reconciliationState, error) { if err := ctx.Err(); err != nil { return nil, err @@ -259,14 +275,15 @@ func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, ps *preset levelFn = vlogger.Info } levelFn(ctx, "template prebuild state retrieved", - slog.F("to_create", actions.create), slog.F("to_delete", len(actions.deleteIDs)), + slog.F("create_count", actions.create), slog.F("delete_count", len(actions.deleteIDs)), + slog.F("to_delete", actions.deleteIDs), slog.F("desired", actions.desired), slog.F("actual", actions.actual), slog.F("outdated", actions.outdated), slog.F("extraneous", actions.extraneous), slog.F("starting", actions.starting), slog.F("stopping", actions.stopping), slog.F("deleting", actions.deleting), slog.F("eligible", actions.eligible)) // Provision workspaces within the same tx so we don't get any timing issues here. - // i.e. we hold the advisory lock until all reconciliatory actions have been taken. + // i.e. we hold the advisory lock until all "reconciliatory" actions have been taken. // TODO: max per reconciliation iteration? // TODO: i've removed the surrounding tx, but if we restore it then we need to pass down the store to these funcs. diff --git a/enterprise/coderd/prebuilds/controller_test.go b/enterprise/coderd/prebuilds/controller_test.go deleted file mode 100644 index 55e6eada05..0000000000 --- a/enterprise/coderd/prebuilds/controller_test.go +++ /dev/null @@ -1,130 +0,0 @@ -package prebuilds - -import ( - "testing" - "time" - - "github.com/google/uuid" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/coder/coder/v2/coderd/database" -) - -var ( - templateID = uuid.New() - templateVersionID = uuid.New() - presetID = uuid.New() - preset2ID = uuid.New() - prebuildID = uuid.New() -) - -func TestReconciliationActions(t *testing.T) { - cases := map[string]struct { - preset database.GetTemplatePresetsWithPrebuildsRow // TODO: make own structs; reusing these types is lame - running []database.GetRunningPrebuildsRow - inProgress []database.GetPrebuildsInProgressRow - expected reconciliationActions - }{ - // New template version created which adds a new preset with prebuilds configured. - "CreateNetNew": { - preset: preset(true, 1), - expected: reconciliationActions{ - desired: 1, - create: 1, - }, - }, - // New template version created, making an existing preset and its prebuilds outdated. - "DeleteOutdated": { - preset: preset(false, 1), - running: []database.GetRunningPrebuildsRow{ - { - WorkspaceID: prebuildID, - TemplateID: templateID, - TemplateVersionID: templateVersionID, - CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, - DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, - Ready: true, - }, - }, - expected: reconciliationActions{ - outdated: 1, - deleteIDs: []uuid.UUID{prebuildID}, - }, - }, - // Somehow an additional prebuild is running, delete it. - // This can happen if an operator messes with a prebuild's state (stop, start). - "DeleteOldestExtraneous": { - preset: preset(true, 1), - running: []database.GetRunningPrebuildsRow{ - { - WorkspaceID: prebuildID, - TemplateID: templateID, - TemplateVersionID: templateVersionID, - CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, - DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, - CreatedAt: time.Now().Add(-time.Hour), - }, - { - WorkspaceID: uuid.New(), - TemplateID: templateID, - TemplateVersionID: templateVersionID, - CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true}, - DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true}, - CreatedAt: time.Now(), - }, - }, - expected: reconciliationActions{ - desired: 1, - extraneous: 1, - actual: 2, - deleteIDs: []uuid.UUID{prebuildID}, - }, - }, - } - - for name, tc := range cases { - t.Run(name, func(t *testing.T) { - t.Parallel() - - ps := presetState{ - preset: tc.preset, - running: tc.running, - inProgress: tc.inProgress, - } - - actions, err := ps.calculateActions() - require.NoError(t, err, "could not calculate reconciliation actions") - - validateActions(t, tc.expected, *actions) - }) - } -} - -func preset(active bool, instances int32) database.GetTemplatePresetsWithPrebuildsRow { - return database.GetTemplatePresetsWithPrebuildsRow{ - TemplateID: templateID, - TemplateVersionID: templateVersionID, - UsingActiveVersion: active, - PresetID: presetID, - Name: "bob", - DesiredInstances: instances, - Deleted: false, - Deprecated: false, - } -} - -// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for -// prebuilds align with zero values. -func validateActions(t *testing.T, expected, actual reconciliationActions) bool { - return assert.EqualValuesf(t, expected.deleteIDs, actual.deleteIDs, "'deleteIDs' did not match expectation") && - assert.EqualValuesf(t, expected.create, actual.create, "'create' did not match expectation") && - assert.EqualValuesf(t, expected.desired, actual.desired, "'desired' did not match expectation") && - assert.EqualValuesf(t, expected.actual, actual.actual, "'actual' did not match expectation") && - assert.EqualValuesf(t, expected.eligible, actual.eligible, "'eligible' did not match expectation") && - assert.EqualValuesf(t, expected.extraneous, actual.extraneous, "'extraneous' did not match expectation") && - assert.EqualValuesf(t, expected.outdated, actual.outdated, "'outdated' did not match expectation") && - assert.EqualValuesf(t, expected.starting, actual.starting, "'starting' did not match expectation") && - assert.EqualValuesf(t, expected.stopping, actual.stopping, "'stopping' did not match expectation") && - assert.EqualValuesf(t, expected.deleting, actual.deleting, "'deleting' did not match expectation") -} diff --git a/enterprise/coderd/prebuilds/reconciliation.go b/enterprise/coderd/prebuilds/reconciliation.go index 2482f1d050..fe5dede958 100644 --- a/enterprise/coderd/prebuilds/reconciliation.go +++ b/enterprise/coderd/prebuilds/reconciliation.go @@ -47,7 +47,7 @@ func (s reconciliationState) filterByPreset(presetID uuid.UUID) (*presetState, e } running := slice.Filter(s.runningPrebuilds, func(prebuild database.GetRunningPrebuildsRow) bool { - if !prebuild.DesiredPresetID.Valid && !prebuild.CurrentPresetID.Valid { + if !prebuild.CurrentPresetID.Valid { return false } return prebuild.CurrentPresetID.UUID == preset.PresetID && @@ -57,8 +57,10 @@ func (s reconciliationState) filterByPreset(presetID uuid.UUID) (*presetState, e // These aren't preset-specific, but they need to inhibit all presets of this template from operating since they could // be in-progress builds which might impact another preset. For example, if a template goes from no defined prebuilds to defined prebuilds // and back, or a template is updated from one version to another. + // We group by the template so that all prebuilds being provisioned for a prebuild are inhibited if any prebuild for + // any preset in that template are in progress, to prevent clobbering. inProgress := slice.Filter(s.prebuildsInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool { - return prebuild.TemplateVersionID == preset.TemplateVersionID + return prebuild.TemplateID == preset.TemplateID }) return &presetState{ @@ -103,6 +105,8 @@ func (p presetState) calculateActions() (*reconciliationActions, error) { } } + // In-progress builds are common across all presets belonging to a given template. + // In other words: these values will be identical across all presets belonging to this template. for _, progress := range p.inProgress { switch progress.Transition { case database.WorkspaceTransitionStart: @@ -138,6 +142,7 @@ func (p presetState) calculateActions() (*reconciliationActions, error) { ) // Bail early to avoid scheduling new prebuilds while operations are in progress. + // TODO: optimization: we should probably be able to create prebuilds while others are deleting for a given preset. if (toCreate+toDelete) > 0 && (starting+stopping+deleting) > 0 { // TODO: move up //c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation", diff --git a/enterprise/coderd/prebuilds/reconciliation_test.go b/enterprise/coderd/prebuilds/reconciliation_test.go new file mode 100644 index 0000000000..1efa43195d --- /dev/null +++ b/enterprise/coderd/prebuilds/reconciliation_test.go @@ -0,0 +1,400 @@ +package prebuilds + +import ( + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/database" +) + +type options struct { + templateID uuid.UUID + templateVersionID uuid.UUID + presetID uuid.UUID + presetName string + prebuildID uuid.UUID + workspaceName string +} + +// templateID is common across all option sets. +var templateID = uuid.New() + +const ( + optionSet0 = iota + optionSet1 + optionSet2 +) + +var opts = map[uint]options{ + optionSet0: { + templateID: templateID, + templateVersionID: uuid.New(), + presetID: uuid.New(), + presetName: "my-preset", + prebuildID: uuid.New(), + workspaceName: "prebuilds0", + }, + optionSet1: { + templateID: templateID, + templateVersionID: uuid.New(), + presetID: uuid.New(), + presetName: "my-preset", + prebuildID: uuid.New(), + workspaceName: "prebuilds1", + }, + optionSet2: { + templateID: templateID, + templateVersionID: uuid.New(), + presetID: uuid.New(), + presetName: "my-preset", + prebuildID: uuid.New(), + workspaceName: "prebuilds2", + }, +} + +// A new template version with a preset without prebuilds configured should result in no prebuilds being created. +func TestNoPrebuilds(t *testing.T) { + current := opts[optionSet0] + + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 0, current), + } + + state := newReconciliationState(presets, nil, nil) + ps, err := state.filterByPreset(current.presetID) + require.NoError(t, err) + + actions, err := ps.calculateActions() + require.NoError(t, err) + + validateActions(t, reconciliationActions{ /*all zero values*/ }, *actions) +} + +// A new template version with a preset with prebuilds configured should result in a new prebuild being created. +func TestNetNew(t *testing.T) { + current := opts[optionSet0] + + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + } + + state := newReconciliationState(presets, nil, nil) + ps, err := state.filterByPreset(current.presetID) + require.NoError(t, err) + + actions, err := ps.calculateActions() + require.NoError(t, err) + + validateActions(t, reconciliationActions{ + desired: 1, + create: 1, + }, *actions) +} + +// A new template version is created with a preset with prebuilds configured; this outdates the older version and +// requires the old prebuilds to be destroyed and new prebuilds to be created. +func TestOutdatedPrebuilds(t *testing.T) { + outdated := opts[optionSet0] + current := opts[optionSet1] + + // GIVEN: 2 presets, one outdated and one new. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + preset(true, 1, current), + } + + // GIVEN: a running prebuild for the outdated preset. + running := []database.GetRunningPrebuildsRow{ + prebuild(outdated), + } + + // GIVEN: no in-progress builds. + var inProgress []database.GetPrebuildsInProgressRow + + // WHEN: calculating the outdated preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: we should identify that this prebuild is outdated and needs to be deleted. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{outdated: 1, deleteIDs: []uuid.UUID{outdated.prebuildID}}, *actions) + + // WHEN: calculating the current preset's state. + ps, err = state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we should not be blocked from creating a new prebuild while the outdate one deletes. + actions, err = ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{desired: 1, create: 1}, *actions) +} + +// A new template version is created with a preset with prebuilds configured; while the outdated prebuild is deleting, +// the new preset's prebuild cannot be provisioned concurrently, to prevent clobbering. +func TestBlockedOnDeleteActions(t *testing.T) { + outdated := opts[optionSet0] + current := opts[optionSet1] + + // GIVEN: 2 presets, one outdated and one new. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + preset(true, 1, current), + } + + // GIVEN: a running prebuild for the outdated preset. + running := []database.GetRunningPrebuildsRow{ + prebuild(outdated), + } + + // GIVEN: one prebuild for the old preset which is currently deleting. + inProgress := []database.GetPrebuildsInProgressRow{ + { + TemplateID: outdated.templateID, + TemplateVersionID: outdated.templateVersionID, + Transition: database.WorkspaceTransitionDelete, + Count: 1, + }, + } + + // WHEN: calculating the outdated preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: we should identify that this prebuild is in progress, and not attempt to delete this prebuild again. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{outdated: 1, deleting: 1}, *actions) + + // WHEN: calculating the current preset's state. + ps, err = state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we are blocked from creating a new prebuild while another one is busy provisioning. + actions, err = ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{desired: 1, create: 0, deleting: 1}, *actions) +} + +// A new template version is created with a preset with prebuilds configured. An operator comes along and stops one of the +// running prebuilds (this shouldn't be done, but it's possible). While this prebuild is stopping, all other prebuild +// actions are blocked. +func TestBlockedOnStopActions(t *testing.T) { + outdated := opts[optionSet0] + current := opts[optionSet1] + + // GIVEN: 2 presets, one outdated and one new (which now expects 2 prebuilds!). + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + preset(true, 2, current), + } + + // GIVEN: NO running prebuilds for either preset. + var running []database.GetRunningPrebuildsRow + + // GIVEN: one prebuild for the old preset which is currently stopping. + inProgress := []database.GetPrebuildsInProgressRow{ + { + TemplateID: outdated.templateID, + TemplateVersionID: outdated.templateVersionID, + Transition: database.WorkspaceTransitionStop, + Count: 1, + }, + } + + // WHEN: calculating the outdated preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: there is nothing to do. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{stopping: 1}, *actions) + + // WHEN: calculating the current preset's state. + ps, err = state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we are blocked from creating a new prebuild while another one is busy provisioning. + actions, err = ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{desired: 2, stopping: 1, create: 0}, *actions) +} + +// A new template version is created with a preset with prebuilds configured; the outdated prebuilds are deleted, +// and one of the new prebuilds is already being provisioned, but we bail out early if operations are already in progress +// for this prebuild - to prevent clobbering. +func TestBlockedOnStartActions(t *testing.T) { + outdated := opts[optionSet0] + current := opts[optionSet1] + + // GIVEN: 2 presets, one outdated and one new (which now expects 2 prebuilds!). + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + preset(true, 2, current), + } + + // GIVEN: NO running prebuilds for either preset. + var running []database.GetRunningPrebuildsRow + + // GIVEN: one prebuild for the old preset which is currently provisioning. + inProgress := []database.GetPrebuildsInProgressRow{ + { + TemplateID: current.templateID, + TemplateVersionID: current.templateVersionID, + Transition: database.WorkspaceTransitionStart, + Count: 1, + }, + } + + // WHEN: calculating the outdated preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: there is nothing to do. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{starting: 1}, *actions) + + // WHEN: calculating the current preset's state. + ps, err = state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we are blocked from creating a new prebuild while another one is busy provisioning. + actions, err = ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{desired: 2, starting: 1, create: 0}, *actions) +} + +// Additional prebuilds exist for a given preset configuration; these must be deleted. +func TestExtraneous(t *testing.T) { + current := opts[optionSet0] + + // GIVEN: a preset with 1 desired prebuild. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + } + + var older uuid.UUID + // GIVEN: 2 running prebuilds for the preset. + running := []database.GetRunningPrebuildsRow{ + prebuild(current, func(row database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow { + // The older of the running prebuilds will be deleted in order to maintain freshness. + row.CreatedAt = time.Now().Add(-time.Hour) + older = row.WorkspaceID + return row + }), + prebuild(current, func(row database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow { + row.CreatedAt = time.Now() + return row + }), + } + + // GIVEN: NO prebuilds in progress. + var inProgress []database.GetPrebuildsInProgressRow + + // WHEN: calculating the current preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: an extraneous prebuild is detected and marked for deletion. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{ + actual: 2, desired: 1, extraneous: 1, deleteIDs: []uuid.UUID{older}, eligible: 2, + }, *actions) +} + +// As above, but no actions will be performed because +func TestExtraneousInProgress(t *testing.T) { + current := opts[optionSet0] + + // GIVEN: a preset with 1 desired prebuild. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + } + + var older uuid.UUID + // GIVEN: 2 running prebuilds for the preset. + running := []database.GetRunningPrebuildsRow{ + prebuild(current, func(row database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow { + // The older of the running prebuilds will be deleted in order to maintain freshness. + row.CreatedAt = time.Now().Add(-time.Hour) + older = row.WorkspaceID + return row + }), + prebuild(current, func(row database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow { + row.CreatedAt = time.Now() + return row + }), + } + + // GIVEN: NO prebuilds in progress. + var inProgress []database.GetPrebuildsInProgressRow + + // WHEN: calculating the current preset's state. + state := newReconciliationState(presets, running, inProgress) + ps, err := state.filterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: an extraneous prebuild is detected and marked for deletion. + actions, err := ps.calculateActions() + require.NoError(t, err) + validateActions(t, reconciliationActions{ + actual: 2, desired: 1, extraneous: 1, deleteIDs: []uuid.UUID{older}, eligible: 2, + }, *actions) +} + +func preset(active bool, instances int32, opts options) database.GetTemplatePresetsWithPrebuildsRow { + return database.GetTemplatePresetsWithPrebuildsRow{ + TemplateID: opts.templateID, + TemplateVersionID: opts.templateVersionID, + PresetID: opts.presetID, + UsingActiveVersion: active, + Name: opts.presetName, + DesiredInstances: instances, + Deleted: false, + Deprecated: false, + } +} + +func prebuild(opts options, muts ...func(row database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow) database.GetRunningPrebuildsRow { + entry := database.GetRunningPrebuildsRow{ + WorkspaceID: opts.prebuildID, + WorkspaceName: opts.workspaceName, + TemplateID: opts.templateID, + TemplateVersionID: opts.templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: opts.presetID, Valid: true}, + Ready: true, + CreatedAt: time.Now(), + } + + for _, mut := range muts { + entry = mut(entry) + } + return entry +} + +// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for +// prebuilds align with zero values. +func validateActions(t *testing.T, expected, actual reconciliationActions) bool { + return assert.EqualValuesf(t, expected.deleteIDs, actual.deleteIDs, "'deleteIDs' did not match expectation") && + assert.EqualValuesf(t, expected.create, actual.create, "'create' did not match expectation") && + assert.EqualValuesf(t, expected.desired, actual.desired, "'desired' did not match expectation") && + assert.EqualValuesf(t, expected.actual, actual.actual, "'actual' did not match expectation") && + assert.EqualValuesf(t, expected.eligible, actual.eligible, "'eligible' did not match expectation") && + assert.EqualValuesf(t, expected.extraneous, actual.extraneous, "'extraneous' did not match expectation") && + assert.EqualValuesf(t, expected.outdated, actual.outdated, "'outdated' did not match expectation") && + assert.EqualValuesf(t, expected.starting, actual.starting, "'starting' did not match expectation") && + assert.EqualValuesf(t, expected.stopping, actual.stopping, "'stopping' did not match expectation") && + assert.EqualValuesf(t, expected.deleting, actual.deleting, "'deleting' did not match expectation") +}