mirror of
https://github.com/coder/coder.git
synced 2025-07-15 22:20:27 +00:00
Refactoring reconciliation loop into control & logic, adding initial (incomplete) tests
Signed-off-by: Danny Kopping <danny@coder.com>
This commit is contained in:
@ -13,6 +13,7 @@ const (
|
|||||||
LockIDNotificationsReportGenerator
|
LockIDNotificationsReportGenerator
|
||||||
LockIDCryptoKeyRotation
|
LockIDCryptoKeyRotation
|
||||||
LockIDReconcileTemplatePrebuilds
|
LockIDReconcileTemplatePrebuilds
|
||||||
|
LockIDDeterminePrebuildsState
|
||||||
)
|
)
|
||||||
|
|
||||||
// GenLockID generates a unique and consistent lock ID from a given string.
|
// GenLockID generates a unique and consistent lock ID from a given string.
|
||||||
|
@ -6,18 +6,15 @@ import (
|
|||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/base32"
|
"encoding/base32"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
|
||||||
mrand "math/rand"
|
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/coder/coder/v2/coderd/util/slice"
|
|
||||||
"github.com/hashicorp/go-multierror"
|
"github.com/hashicorp/go-multierror"
|
||||||
|
|
||||||
"golang.org/x/exp/slices"
|
|
||||||
|
|
||||||
"github.com/coder/coder/v2/coderd/audit"
|
"github.com/coder/coder/v2/coderd/audit"
|
||||||
|
"github.com/coder/coder/v2/coderd/database"
|
||||||
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||||
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
|
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
|
||||||
"github.com/coder/coder/v2/coderd/database/pubsub"
|
"github.com/coder/coder/v2/coderd/database/pubsub"
|
||||||
@ -31,9 +28,6 @@ import (
|
|||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
"golang.org/x/xerrors"
|
"golang.org/x/xerrors"
|
||||||
|
|
||||||
"github.com/coder/coder/v2/coderd/database"
|
|
||||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type Controller struct {
|
type Controller struct {
|
||||||
@ -134,39 +128,41 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
defer logger.Debug(ctx, "acquired top-level prebuilds reconciliation lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds())))
|
logger.Debug(ctx, "acquired top-level prebuilds reconciliation lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds())))
|
||||||
|
|
||||||
innerCtx, cancel := context.WithTimeout(ctx, time.Second*30)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
var id uuid.NullUUID
|
var id uuid.NullUUID
|
||||||
if templateID != nil {
|
if templateID != nil {
|
||||||
id.UUID = *templateID
|
id.UUID = *templateID
|
||||||
|
id.Valid = true
|
||||||
}
|
}
|
||||||
|
|
||||||
presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, id)
|
state, err := c.determineState(ctx, db, id)
|
||||||
if len(presetsWithPrebuilds) == 0 {
|
if err != nil {
|
||||||
logger.Debug(innerCtx, "no templates found with prebuilds configured")
|
return xerrors.Errorf("determine current state: %w", err)
|
||||||
|
}
|
||||||
|
if len(state.presets) == 0 {
|
||||||
|
logger.Debug(ctx, "no templates found with prebuilds configured")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
runningPrebuilds, err := db.GetRunningPrebuilds(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return xerrors.Errorf("failed to get running prebuilds: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
prebuildsInProgress, err := db.GetPrebuildsInProgress(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: bounded concurrency? probably not but consider
|
// TODO: bounded concurrency? probably not but consider
|
||||||
var eg errgroup.Group
|
var eg errgroup.Group
|
||||||
for _, preset := range presetsWithPrebuilds {
|
for _, preset := range state.presets {
|
||||||
|
ps, err := state.filterByPreset(preset.PresetID)
|
||||||
|
if err != nil {
|
||||||
|
logger.Warn(ctx, "failed to find preset state", slog.Error(err), slog.F("preset_id", preset.PresetID.String()))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if !preset.UsingActiveVersion && len(ps.running) == 0 && len(ps.inProgress) == 0 {
|
||||||
|
logger.Debug(ctx, "skipping reconciliation for preset; inactive, no running prebuilds, and no in-progress operationss",
|
||||||
|
slog.F("preset_id", preset.PresetID.String()))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
eg.Go(func() error {
|
eg.Go(func() error {
|
||||||
// Pass outer context.
|
// Pass outer context.
|
||||||
// TODO: name these better to avoid the comment.
|
err := c.reconcilePrebuildsForPreset(ctx, ps)
|
||||||
err := c.reconcilePrebuildsForPreset(ctx, preset, runningPrebuilds, prebuildsInProgress)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(ctx, "failed to reconcile prebuilds for preset", slog.Error(err), slog.F("preset_id", preset.PresetID))
|
logger.Error(ctx, "failed to reconcile prebuilds for preset", slog.Error(err), slog.F("preset_id", preset.PresetID))
|
||||||
}
|
}
|
||||||
@ -186,186 +182,64 @@ func (c *Controller) reconcile(ctx context.Context, templateID *uuid.UUID) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type reconciliationActions struct {
|
// determineState determines the current state of prebuilds & the presets which define them.
|
||||||
deleteIDs []uuid.UUID
|
// This function MUST be called within
|
||||||
createIDs []uuid.UUID
|
func (c *Controller) determineState(ctx context.Context, store database.Store, id uuid.NullUUID) (*reconciliationState, error) {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
actual int32 // Running prebuilds for active version.
|
var state reconciliationState
|
||||||
desired int32 // Active template version's desired instances as defined in preset.
|
|
||||||
eligible int32 // Prebuilds which can be claimed.
|
err := store.InTx(func(db database.Store) error {
|
||||||
outdated int32 // Prebuilds which no longer match the active template version.
|
start := time.Now()
|
||||||
extraneous int32 // Extra running prebuilds for active version (somehow).
|
|
||||||
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
|
// TODO: give up after some time waiting on this?
|
||||||
|
err := db.AcquireLock(ctx, database.LockIDDeterminePrebuildsState)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.Errorf("failed to acquire state determination lock: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
c.logger.Debug(ctx, "acquired state determination lock", slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", time.Since(start).Seconds())))
|
||||||
|
|
||||||
|
presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, id)
|
||||||
|
if len(presetsWithPrebuilds) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
allRunningPrebuilds, err := db.GetRunningPrebuilds(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.Errorf("failed to get running prebuilds: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
allPrebuildsInProgress, err := db.GetPrebuildsInProgress(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.Errorf("failed to get prebuilds in progress: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
state = newReconciliationState(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress)
|
||||||
|
return nil
|
||||||
|
}, &database.TxOptions{
|
||||||
|
Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs
|
||||||
|
ReadOnly: true,
|
||||||
|
TxIdentifier: "prebuilds_state_determination",
|
||||||
|
})
|
||||||
|
|
||||||
|
return &state, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculateActions MUST be called within the context of a transaction (TODO: isolation)
|
func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, ps *presetState) error {
|
||||||
// with an advisory lock to prevent TOCTOU races.
|
if ps == nil {
|
||||||
func (c *Controller) calculateActions(ctx context.Context, preset database.GetTemplatePresetsWithPrebuildsRow, running []database.GetRunningPrebuildsRow, inProgress []database.GetPrebuildsInProgressRow) (*reconciliationActions, error) {
|
return xerrors.Errorf("unexpected nil preset state")
|
||||||
// TODO: align workspace states with how we represent them on the FE and the CLI
|
|
||||||
// right now there's some slight differences which can lead to additional prebuilds being created
|
|
||||||
|
|
||||||
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
|
|
||||||
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
|
|
||||||
|
|
||||||
var (
|
|
||||||
actual int32 // Running prebuilds for active version.
|
|
||||||
desired int32 // Active template version's desired instances as defined in preset.
|
|
||||||
eligible int32 // Prebuilds which can be claimed.
|
|
||||||
outdated int32 // Prebuilds which no longer match the active template version.
|
|
||||||
extraneous int32 // Extra running prebuilds for active version (somehow).
|
|
||||||
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
|
|
||||||
)
|
|
||||||
|
|
||||||
if preset.UsingActiveVersion {
|
|
||||||
actual = int32(len(running))
|
|
||||||
desired = preset.DesiredInstances
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, prebuild := range running {
|
logger := c.logger.With(slog.F("template_id", ps.preset.TemplateID.String()))
|
||||||
if preset.UsingActiveVersion {
|
|
||||||
if prebuild.Eligible {
|
|
||||||
eligible++
|
|
||||||
}
|
|
||||||
|
|
||||||
extraneous = int32(math.Max(float64(actual-preset.DesiredInstances), 0))
|
|
||||||
}
|
|
||||||
|
|
||||||
if prebuild.TemplateVersionID == preset.TemplateVersionID && !preset.UsingActiveVersion {
|
|
||||||
outdated++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, progress := range inProgress {
|
|
||||||
switch progress.Transition {
|
|
||||||
case database.WorkspaceTransitionStart:
|
|
||||||
starting++
|
|
||||||
case database.WorkspaceTransitionStop:
|
|
||||||
stopping++
|
|
||||||
case database.WorkspaceTransitionDelete:
|
|
||||||
deleting++
|
|
||||||
default:
|
|
||||||
c.logger.Warn(ctx, "unknown transition found in prebuilds in progress result", slog.F("transition", progress.Transition))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
toCreate = int(math.Max(0, float64(
|
|
||||||
desired- // The number specified in the preset
|
|
||||||
(actual+starting)- // The current number of prebuilds (or builds in-flight)
|
|
||||||
stopping), // The number of prebuilds currently being stopped (should be 0)
|
|
||||||
))
|
|
||||||
toDelete = int(math.Max(0, float64(
|
|
||||||
outdated- // The number of prebuilds running above the desired count for active version
|
|
||||||
deleting), // The number of prebuilds currently being deleted
|
|
||||||
))
|
|
||||||
|
|
||||||
actions = &reconciliationActions{
|
|
||||||
actual: actual,
|
|
||||||
desired: desired,
|
|
||||||
eligible: eligible,
|
|
||||||
outdated: outdated,
|
|
||||||
extraneous: extraneous,
|
|
||||||
starting: starting,
|
|
||||||
stopping: stopping,
|
|
||||||
deleting: deleting,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
// Bail early to avoid scheduling new prebuilds while operations are in progress.
|
|
||||||
if (toCreate+toDelete) > 0 && (starting+stopping+deleting) > 0 {
|
|
||||||
c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation",
|
|
||||||
slog.F("template_id", preset.TemplateID.String()), slog.F("starting", starting),
|
|
||||||
slog.F("stopping", stopping), slog.F("deleting", deleting),
|
|
||||||
slog.F("wanted_to_create", toCreate), slog.F("wanted_to_delete", toDelete))
|
|
||||||
return actions, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
|
|
||||||
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
|
|
||||||
if extraneous > 0 {
|
|
||||||
// Sort running IDs randomly so we can pick random victims.
|
|
||||||
slices.SortFunc(running, func(_, _ database.GetRunningPrebuildsRow) int {
|
|
||||||
if mrand.Float64() > 0.5 {
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
return 1
|
|
||||||
})
|
|
||||||
|
|
||||||
var victims []uuid.UUID
|
|
||||||
for i := 0; i < int(extraneous); i++ {
|
|
||||||
if i >= len(running) {
|
|
||||||
// This should never happen.
|
|
||||||
c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!",
|
|
||||||
slog.F("running_count", len(running)),
|
|
||||||
slog.F("extraneous", extraneous))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
victims = append(victims, running[i].WorkspaceID)
|
|
||||||
}
|
|
||||||
|
|
||||||
actions.deleteIDs = append(actions.deleteIDs, victims...)
|
|
||||||
|
|
||||||
c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)",
|
|
||||||
slog.F("template_id", preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous),
|
|
||||||
slog.F("victims", victims))
|
|
||||||
|
|
||||||
// Prevent the rest of the reconciliation from completing
|
|
||||||
return actions, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the template has become deleted or deprecated since the last reconciliation, we need to ensure we
|
|
||||||
// scale those prebuilds down to zero.
|
|
||||||
if preset.Deleted || preset.Deprecated {
|
|
||||||
toCreate = 0
|
|
||||||
toDelete = int(actual + outdated)
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := 0; i < toCreate; i++ {
|
|
||||||
actions.createIDs = append(actions.createIDs, uuid.New())
|
|
||||||
}
|
|
||||||
|
|
||||||
if toDelete > 0 && len(running) != toDelete {
|
|
||||||
c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!",
|
|
||||||
slog.F("template_id", preset.TemplateID.String()), slog.F("running", len(running)), slog.F("to_delete", toDelete))
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: implement lookup to not perform same action on workspace multiple times in $period
|
|
||||||
// i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion
|
|
||||||
for i := 0; i < toDelete; i++ {
|
|
||||||
if i >= len(running) {
|
|
||||||
// Above warning will have already addressed this.
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
actions.deleteIDs = append(actions.deleteIDs, running[i].WorkspaceID)
|
|
||||||
}
|
|
||||||
|
|
||||||
return actions, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset database.GetTemplatePresetsWithPrebuildsRow,
|
|
||||||
allRunning []database.GetRunningPrebuildsRow, allInProgress []database.GetPrebuildsInProgressRow,
|
|
||||||
) error {
|
|
||||||
logger := c.logger.With(slog.F("template_id", preset.TemplateID.String()))
|
|
||||||
|
|
||||||
var lastErr multierror.Error
|
var lastErr multierror.Error
|
||||||
vlogger := logger.With(slog.F("template_version_id", preset.TemplateVersionID), slog.F("preset_id", preset.PresetID))
|
vlogger := logger.With(slog.F("template_version_id", ps.preset.TemplateVersionID), slog.F("preset_id", ps.preset.PresetID))
|
||||||
|
|
||||||
running := slice.Filter(allRunning, func(prebuild database.GetRunningPrebuildsRow) bool {
|
// TODO: move log lines up from calculateActions.
|
||||||
if !prebuild.DesiredPresetID.Valid && !prebuild.CurrentPresetID.Valid {
|
actions, err := ps.calculateActions()
|
||||||
return false
|
|
||||||
}
|
|
||||||
return prebuild.CurrentPresetID.UUID == preset.PresetID &&
|
|
||||||
prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe.
|
|
||||||
})
|
|
||||||
|
|
||||||
inProgress := slice.Filter(allInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool {
|
|
||||||
return prebuild.TemplateVersionID == preset.TemplateVersionID
|
|
||||||
})
|
|
||||||
|
|
||||||
actions, err := c.calculateActions(ctx, preset, running, inProgress)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
vlogger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err))
|
vlogger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err))
|
||||||
return xerrors.Errorf("failed to calculate reconciliation actions: %w", err)
|
return xerrors.Errorf("failed to calculate reconciliation actions: %w", err)
|
||||||
@ -380,12 +254,12 @@ func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset dat
|
|||||||
})
|
})
|
||||||
|
|
||||||
levelFn := vlogger.Debug
|
levelFn := vlogger.Debug
|
||||||
if len(actions.createIDs) > 0 || len(actions.deleteIDs) > 0 {
|
if actions.create > 0 || len(actions.deleteIDs) > 0 {
|
||||||
// Only log with info level when there's a change that needs to be effected.
|
// Only log with info level when there's a change that needs to be effected.
|
||||||
levelFn = vlogger.Info
|
levelFn = vlogger.Info
|
||||||
}
|
}
|
||||||
levelFn(ctx, "template prebuild state retrieved",
|
levelFn(ctx, "template prebuild state retrieved",
|
||||||
slog.F("to_create", len(actions.createIDs)), slog.F("to_delete", len(actions.deleteIDs)),
|
slog.F("to_create", actions.create), slog.F("to_delete", len(actions.deleteIDs)),
|
||||||
slog.F("desired", actions.desired), slog.F("actual", actions.actual),
|
slog.F("desired", actions.desired), slog.F("actual", actions.actual),
|
||||||
slog.F("outdated", actions.outdated), slog.F("extraneous", actions.extraneous),
|
slog.F("outdated", actions.outdated), slog.F("extraneous", actions.extraneous),
|
||||||
slog.F("starting", actions.starting), slog.F("stopping", actions.stopping),
|
slog.F("starting", actions.starting), slog.F("stopping", actions.stopping),
|
||||||
@ -396,15 +270,15 @@ func (c *Controller) reconcilePrebuildsForPreset(ctx context.Context, preset dat
|
|||||||
// TODO: max per reconciliation iteration?
|
// TODO: max per reconciliation iteration?
|
||||||
|
|
||||||
// TODO: i've removed the surrounding tx, but if we restore it then we need to pass down the store to these funcs.
|
// TODO: i've removed the surrounding tx, but if we restore it then we need to pass down the store to these funcs.
|
||||||
for _, id := range actions.createIDs {
|
for range actions.create {
|
||||||
if err := c.createPrebuild(ownerCtx, id, preset.TemplateID, preset.PresetID); err != nil {
|
if err := c.createPrebuild(ownerCtx, uuid.New(), ps.preset.TemplateID, ps.preset.PresetID); err != nil {
|
||||||
vlogger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
vlogger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
||||||
lastErr.Errors = append(lastErr.Errors, err)
|
lastErr.Errors = append(lastErr.Errors, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, id := range actions.deleteIDs {
|
for _, id := range actions.deleteIDs {
|
||||||
if err := c.deletePrebuild(ownerCtx, id, preset.TemplateID, preset.PresetID); err != nil {
|
if err := c.deletePrebuild(ownerCtx, id, ps.preset.TemplateID, ps.preset.PresetID); err != nil {
|
||||||
vlogger.Error(ctx, "failed to delete prebuild", slog.Error(err))
|
vlogger.Error(ctx, "failed to delete prebuild", slog.Error(err))
|
||||||
lastErr.Errors = append(lastErr.Errors, err)
|
lastErr.Errors = append(lastErr.Errors, err)
|
||||||
}
|
}
|
||||||
|
130
enterprise/coderd/prebuilds/controller_test.go
Normal file
130
enterprise/coderd/prebuilds/controller_test.go
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
package prebuilds
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/coder/coder/v2/coderd/database"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
templateID = uuid.New()
|
||||||
|
templateVersionID = uuid.New()
|
||||||
|
presetID = uuid.New()
|
||||||
|
preset2ID = uuid.New()
|
||||||
|
prebuildID = uuid.New()
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestReconciliationActions(t *testing.T) {
|
||||||
|
cases := map[string]struct {
|
||||||
|
preset database.GetTemplatePresetsWithPrebuildsRow // TODO: make own structs; reusing these types is lame
|
||||||
|
running []database.GetRunningPrebuildsRow
|
||||||
|
inProgress []database.GetPrebuildsInProgressRow
|
||||||
|
expected reconciliationActions
|
||||||
|
}{
|
||||||
|
// New template version created which adds a new preset with prebuilds configured.
|
||||||
|
"CreateNetNew": {
|
||||||
|
preset: preset(true, 1),
|
||||||
|
expected: reconciliationActions{
|
||||||
|
desired: 1,
|
||||||
|
create: 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// New template version created, making an existing preset and its prebuilds outdated.
|
||||||
|
"DeleteOutdated": {
|
||||||
|
preset: preset(false, 1),
|
||||||
|
running: []database.GetRunningPrebuildsRow{
|
||||||
|
{
|
||||||
|
WorkspaceID: prebuildID,
|
||||||
|
TemplateID: templateID,
|
||||||
|
TemplateVersionID: templateVersionID,
|
||||||
|
CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true},
|
||||||
|
DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
|
||||||
|
Ready: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: reconciliationActions{
|
||||||
|
outdated: 1,
|
||||||
|
deleteIDs: []uuid.UUID{prebuildID},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Somehow an additional prebuild is running, delete it.
|
||||||
|
// This can happen if an operator messes with a prebuild's state (stop, start).
|
||||||
|
"DeleteOldestExtraneous": {
|
||||||
|
preset: preset(true, 1),
|
||||||
|
running: []database.GetRunningPrebuildsRow{
|
||||||
|
{
|
||||||
|
WorkspaceID: prebuildID,
|
||||||
|
TemplateID: templateID,
|
||||||
|
TemplateVersionID: templateVersionID,
|
||||||
|
CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true},
|
||||||
|
DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
|
||||||
|
CreatedAt: time.Now().Add(-time.Hour),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
WorkspaceID: uuid.New(),
|
||||||
|
TemplateID: templateID,
|
||||||
|
TemplateVersionID: templateVersionID,
|
||||||
|
CurrentPresetID: uuid.NullUUID{UUID: presetID, Valid: true},
|
||||||
|
DesiredPresetID: uuid.NullUUID{UUID: uuid.New(), Valid: true},
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: reconciliationActions{
|
||||||
|
desired: 1,
|
||||||
|
extraneous: 1,
|
||||||
|
actual: 2,
|
||||||
|
deleteIDs: []uuid.UUID{prebuildID},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, tc := range cases {
|
||||||
|
t.Run(name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ps := presetState{
|
||||||
|
preset: tc.preset,
|
||||||
|
running: tc.running,
|
||||||
|
inProgress: tc.inProgress,
|
||||||
|
}
|
||||||
|
|
||||||
|
actions, err := ps.calculateActions()
|
||||||
|
require.NoError(t, err, "could not calculate reconciliation actions")
|
||||||
|
|
||||||
|
validateActions(t, tc.expected, *actions)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func preset(active bool, instances int32) database.GetTemplatePresetsWithPrebuildsRow {
|
||||||
|
return database.GetTemplatePresetsWithPrebuildsRow{
|
||||||
|
TemplateID: templateID,
|
||||||
|
TemplateVersionID: templateVersionID,
|
||||||
|
UsingActiveVersion: active,
|
||||||
|
PresetID: presetID,
|
||||||
|
Name: "bob",
|
||||||
|
DesiredInstances: instances,
|
||||||
|
Deleted: false,
|
||||||
|
Deprecated: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for
|
||||||
|
// prebuilds align with zero values.
|
||||||
|
func validateActions(t *testing.T, expected, actual reconciliationActions) bool {
|
||||||
|
return assert.EqualValuesf(t, expected.deleteIDs, actual.deleteIDs, "'deleteIDs' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.create, actual.create, "'create' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.desired, actual.desired, "'desired' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.actual, actual.actual, "'actual' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.eligible, actual.eligible, "'eligible' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.extraneous, actual.extraneous, "'extraneous' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.outdated, actual.outdated, "'outdated' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.starting, actual.starting, "'starting' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.stopping, actual.stopping, "'stopping' did not match expectation") &&
|
||||||
|
assert.EqualValuesf(t, expected.deleting, actual.deleting, "'deleting' did not match expectation")
|
||||||
|
}
|
219
enterprise/coderd/prebuilds/reconciliation.go
Normal file
219
enterprise/coderd/prebuilds/reconciliation.go
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
package prebuilds
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"slices"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"golang.org/x/xerrors"
|
||||||
|
|
||||||
|
"github.com/coder/coder/v2/coderd/database"
|
||||||
|
"github.com/coder/coder/v2/coderd/util/slice"
|
||||||
|
)
|
||||||
|
|
||||||
|
type reconciliationState struct {
|
||||||
|
presets []database.GetTemplatePresetsWithPrebuildsRow
|
||||||
|
runningPrebuilds []database.GetRunningPrebuildsRow
|
||||||
|
prebuildsInProgress []database.GetPrebuildsInProgressRow
|
||||||
|
}
|
||||||
|
|
||||||
|
type presetState struct {
|
||||||
|
preset database.GetTemplatePresetsWithPrebuildsRow
|
||||||
|
running []database.GetRunningPrebuildsRow
|
||||||
|
inProgress []database.GetPrebuildsInProgressRow
|
||||||
|
}
|
||||||
|
|
||||||
|
type reconciliationActions struct {
|
||||||
|
actual int32 // Running prebuilds for active version.
|
||||||
|
desired int32 // Active template version's desired instances as defined in preset.
|
||||||
|
eligible int32 // Prebuilds which can be claimed.
|
||||||
|
outdated int32 // Prebuilds which no longer match the active template version.
|
||||||
|
extraneous int32 // Extra running prebuilds for active version (somehow).
|
||||||
|
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
|
||||||
|
create int32 // The number of prebuilds required to be created to reconcile required state.
|
||||||
|
deleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state.
|
||||||
|
}
|
||||||
|
|
||||||
|
func newReconciliationState(presets []database.GetTemplatePresetsWithPrebuildsRow, runningPrebuilds []database.GetRunningPrebuildsRow, prebuildsInProgress []database.GetPrebuildsInProgressRow) reconciliationState {
|
||||||
|
return reconciliationState{presets: presets, runningPrebuilds: runningPrebuilds, prebuildsInProgress: prebuildsInProgress}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s reconciliationState) filterByPreset(presetID uuid.UUID) (*presetState, error) {
|
||||||
|
preset, found := slice.Find(s.presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool {
|
||||||
|
return preset.PresetID == presetID
|
||||||
|
})
|
||||||
|
if !found {
|
||||||
|
return nil, xerrors.Errorf("no preset found with ID %q", presetID)
|
||||||
|
}
|
||||||
|
|
||||||
|
running := slice.Filter(s.runningPrebuilds, func(prebuild database.GetRunningPrebuildsRow) bool {
|
||||||
|
if !prebuild.DesiredPresetID.Valid && !prebuild.CurrentPresetID.Valid {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return prebuild.CurrentPresetID.UUID == preset.PresetID &&
|
||||||
|
prebuild.TemplateVersionID == preset.TemplateVersionID // Not strictly necessary since presets are 1:1 with template versions, but no harm in being extra safe.
|
||||||
|
})
|
||||||
|
|
||||||
|
// These aren't preset-specific, but they need to inhibit all presets of this template from operating since they could
|
||||||
|
// be in-progress builds which might impact another preset. For example, if a template goes from no defined prebuilds to defined prebuilds
|
||||||
|
// and back, or a template is updated from one version to another.
|
||||||
|
inProgress := slice.Filter(s.prebuildsInProgress, func(prebuild database.GetPrebuildsInProgressRow) bool {
|
||||||
|
return prebuild.TemplateVersionID == preset.TemplateVersionID
|
||||||
|
})
|
||||||
|
|
||||||
|
return &presetState{
|
||||||
|
preset: preset,
|
||||||
|
running: running,
|
||||||
|
inProgress: inProgress,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p presetState) calculateActions() (*reconciliationActions, error) {
|
||||||
|
// TODO: align workspace states with how we represent them on the FE and the CLI
|
||||||
|
// right now there's some slight differences which can lead to additional prebuilds being created
|
||||||
|
|
||||||
|
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
|
||||||
|
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
|
||||||
|
|
||||||
|
var (
|
||||||
|
actual int32 // Running prebuilds for active version.
|
||||||
|
desired int32 // Active template version's desired instances as defined in preset.
|
||||||
|
eligible int32 // Prebuilds which can be claimed.
|
||||||
|
outdated int32 // Prebuilds which no longer match the active template version.
|
||||||
|
extraneous int32 // Extra running prebuilds for active version (somehow).
|
||||||
|
starting, stopping, deleting int32 // Prebuilds currently being provisioned up or down.
|
||||||
|
)
|
||||||
|
|
||||||
|
if p.preset.UsingActiveVersion {
|
||||||
|
actual = int32(len(p.running))
|
||||||
|
desired = p.preset.DesiredInstances
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, prebuild := range p.running {
|
||||||
|
if p.preset.UsingActiveVersion {
|
||||||
|
if prebuild.Ready {
|
||||||
|
eligible++
|
||||||
|
}
|
||||||
|
|
||||||
|
extraneous = int32(math.Max(float64(actual-p.preset.DesiredInstances), 0))
|
||||||
|
}
|
||||||
|
|
||||||
|
if prebuild.TemplateVersionID == p.preset.TemplateVersionID && !p.preset.UsingActiveVersion {
|
||||||
|
outdated++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, progress := range p.inProgress {
|
||||||
|
switch progress.Transition {
|
||||||
|
case database.WorkspaceTransitionStart:
|
||||||
|
starting++
|
||||||
|
case database.WorkspaceTransitionStop:
|
||||||
|
stopping++
|
||||||
|
case database.WorkspaceTransitionDelete:
|
||||||
|
deleting++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
toCreate = int(math.Max(0, float64(
|
||||||
|
desired- // The number specified in the preset
|
||||||
|
(actual+starting)- // The current number of prebuilds (or builds in-flight)
|
||||||
|
stopping), // The number of prebuilds currently being stopped (should be 0)
|
||||||
|
))
|
||||||
|
toDelete = int(math.Max(0, float64(
|
||||||
|
outdated- // The number of prebuilds running above the desired count for active version
|
||||||
|
deleting), // The number of prebuilds currently being deleted
|
||||||
|
))
|
||||||
|
|
||||||
|
actions = &reconciliationActions{
|
||||||
|
actual: actual,
|
||||||
|
desired: desired,
|
||||||
|
eligible: eligible,
|
||||||
|
outdated: outdated,
|
||||||
|
extraneous: extraneous,
|
||||||
|
starting: starting,
|
||||||
|
stopping: stopping,
|
||||||
|
deleting: deleting,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
// Bail early to avoid scheduling new prebuilds while operations are in progress.
|
||||||
|
if (toCreate+toDelete) > 0 && (starting+stopping+deleting) > 0 {
|
||||||
|
// TODO: move up
|
||||||
|
//c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation",
|
||||||
|
// slog.F("template_id", p.preset.TemplateID.String()), slog.F("starting", starting),
|
||||||
|
// slog.F("stopping", stopping), slog.F("deleting", deleting),
|
||||||
|
// slog.F("wanted_to_create", create), slog.F("wanted_to_delete", toDelete))
|
||||||
|
return actions, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
|
||||||
|
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
|
||||||
|
if extraneous > 0 {
|
||||||
|
// Sort running IDs by creation time so we always delete the oldest prebuilds.
|
||||||
|
// In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better).
|
||||||
|
slices.SortFunc(p.running, func(a, b database.GetRunningPrebuildsRow) int {
|
||||||
|
if a.CreatedAt.Before(b.CreatedAt) {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
if a.CreatedAt.After(b.CreatedAt) {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0
|
||||||
|
})
|
||||||
|
|
||||||
|
var victims []uuid.UUID
|
||||||
|
for i := 0; i < int(extraneous); i++ {
|
||||||
|
if i >= len(p.running) {
|
||||||
|
// This should never happen.
|
||||||
|
// TODO: move up
|
||||||
|
//c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!",
|
||||||
|
// slog.F("running_count", len(p.running)),
|
||||||
|
// slog.F("extraneous", extraneous))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
victims = append(victims, p.running[i].WorkspaceID)
|
||||||
|
}
|
||||||
|
|
||||||
|
actions.deleteIDs = append(actions.deleteIDs, victims...)
|
||||||
|
|
||||||
|
// TODO: move up
|
||||||
|
//c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)",
|
||||||
|
// slog.F("template_id", p.preset.TemplateID.String()), slog.F("desired", desired), slog.F("actual", actual), slog.F("extra", extraneous),
|
||||||
|
// slog.F("victims", victims))
|
||||||
|
|
||||||
|
// Prevent the rest of the reconciliation from completing
|
||||||
|
return actions, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the template has become deleted or deprecated since the last reconciliation, we need to ensure we
|
||||||
|
// scale those prebuilds down to zero.
|
||||||
|
if p.preset.Deleted || p.preset.Deprecated {
|
||||||
|
toCreate = 0
|
||||||
|
toDelete = int(actual + outdated)
|
||||||
|
}
|
||||||
|
|
||||||
|
actions.create = int32(toCreate)
|
||||||
|
|
||||||
|
if toDelete > 0 && len(p.running) != toDelete {
|
||||||
|
// TODO: move up
|
||||||
|
//c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!",
|
||||||
|
// slog.F("template_id", s.preset.TemplateID.String()), slog.F("running", len(p.running)), slog.F("to_delete", toDelete))
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: implement lookup to not perform same action on workspace multiple times in $period
|
||||||
|
// i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion
|
||||||
|
for i := 0; i < toDelete; i++ {
|
||||||
|
if i >= len(p.running) {
|
||||||
|
// TODO: move up
|
||||||
|
// Above warning will have already addressed this.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
actions.deleteIDs = append(actions.deleteIDs, p.running[i].WorkspaceID)
|
||||||
|
}
|
||||||
|
|
||||||
|
return actions, nil
|
||||||
|
}
|
Reference in New Issue
Block a user