diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index d9d69cb6f0..5ceaea3fd2 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -5420,68 +5420,76 @@ func (q *sqlQuerier) ClaimPrebuild(ctx context.Context, newOwnerID uuid.UUID) (u const getTemplatePrebuildState = `-- name: GetTemplatePrebuildState :many WITH - -- All prebuilds currently running - running_prebuilds AS (SELECT p.template_id, - b.template_version_id, - COUNT(*) AS count, - STRING_AGG(p.id::text, ',') AS ids - FROM workspace_prebuilds p - INNER JOIN workspace_latest_build b ON b.workspace_id = p.id - INNER JOIN provisioner_jobs pj ON b.job_id = pj.id - INNER JOIN templates t ON p.template_id = t.id - WHERE (b.transition = 'start'::workspace_transition - -- if a deletion job fails, the workspace will still be running - OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, - 'unknown'::provisioner_job_status)) - GROUP BY p.template_id, b.template_version_id), - -- All templates which have been configured for prebuilds (any version) - templates_with_prebuilds AS (SELECT t.id AS template_id, - tv.id AS template_version_id, - tv.id = t.active_version_id AS using_active_version, - tvpp.desired_instances, - t.deleted, - t.deprecated != '' AS deprecated - FROM templates t - INNER JOIN template_versions tv ON tv.template_id = t.id - INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id - INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id - WHERE t.id = $1::uuid - GROUP BY t.id, tv.id, tvpp.id), - -- Jobs relating to prebuilds current in-flight - prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count - FROM workspace_latest_build wlb - INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id - INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id - WHERE pj.job_status NOT IN - ('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status, - 'failed'::provisioner_job_status) - GROUP BY wpb.template_version_id, wpb.transition) + -- All prebuilds currently running + running_prebuilds AS (SELECT p.template_id, + b.template_version_id, + COUNT(*) AS count, + STRING_AGG(p.id::text, ',') AS ids + FROM workspace_prebuilds p + INNER JOIN workspace_latest_build b ON b.workspace_id = p.id + INNER JOIN provisioner_jobs pj ON b.job_id = pj.id + INNER JOIN templates t ON p.template_id = t.id + WHERE (b.transition = 'start'::workspace_transition + -- if a deletion job fails, the workspace will still be running + OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'unknown'::provisioner_job_status)) + GROUP BY p.template_id, b.template_version_id), + -- All templates which have been configured for prebuilds (any version) + templates_with_prebuilds AS (SELECT t.id AS template_id, + tv.id AS template_version_id, + tv.id = t.active_version_id AS using_active_version, + tvpp.desired_instances, + t.deleted, + t.deprecated != '' AS deprecated + FROM templates t + INNER JOIN template_versions tv ON tv.template_id = t.id + INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id + INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id + WHERE t.id = $1::uuid + GROUP BY t.id, tv.id, tvpp.id), + -- Jobs relating to prebuilds current in-flight + prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count + FROM workspace_latest_build wlb + INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id + INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id + WHERE pj.job_status NOT IN + ('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'failed'::provisioner_job_status) + GROUP BY wpb.template_version_id, wpb.transition) SELECT t.template_id, - t.template_version_id, - t.using_active_version AS is_active, - COALESCE(MAX(CASE WHEN p.template_version_id = t.template_version_id THEN p.ids END), - '')::text AS running_prebuild_ids, - COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), - 0)::int AS actual, -- running prebuilds for active version - COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END), - 0)::int AS desired, -- we only care about the active version's desired instances - COALESCE(MAX(CASE - WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false - THEN p.count END), - 0)::int AS extraneous, -- running prebuilds for inactive version - COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS starting, - COALESCE(MAX(CASE WHEN pip.transition = 'stop'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know - COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS deleting, - t.deleted AS template_deleted, - t.deprecated AS template_deprecated + t.template_version_id, + t.using_active_version AS is_active, + MAX(CASE + WHEN p.template_version_id = t.template_version_id THEN p.ids + ELSE '' END)::text AS running_prebuild_ids, + COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), + 0)::int AS actual, -- running prebuilds for active version + MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)::int AS desired, -- we only care about the active version's desired instances + COALESCE(MAX(CASE + WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false + THEN p.count + ELSE 0 END), + 0)::int AS outdated, -- running prebuilds for inactive version + COALESCE(GREATEST( + (MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END)::int + - MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)), + 0), + 0) ::int AS extraneous, -- extra running prebuilds for active version + COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END), + 0)::int AS starting, + COALESCE(MAX(CASE + WHEN pip.transition = 'stop'::workspace_transition THEN pip.count + ELSE 0 END), + 0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know + COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END), + 0)::int AS deleting, + t.deleted AS template_deleted, + t.deprecated AS template_deprecated FROM templates_with_prebuilds t - LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id - LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id + LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id + LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, - p.template_version_id, t.deleted, t.deprecated + p.template_version_id, t.deleted, t.deprecated ` type GetTemplatePrebuildStateRow struct { @@ -5491,6 +5499,7 @@ type GetTemplatePrebuildStateRow struct { RunningPrebuildIds string `db:"running_prebuild_ids" json:"running_prebuild_ids"` Actual int32 `db:"actual" json:"actual"` Desired int32 `db:"desired" json:"desired"` + Outdated int32 `db:"outdated" json:"outdated"` Extraneous int32 `db:"extraneous" json:"extraneous"` Starting int32 `db:"starting" json:"starting"` Stopping int32 `db:"stopping" json:"stopping"` @@ -5515,6 +5524,7 @@ func (q *sqlQuerier) GetTemplatePrebuildState(ctx context.Context, templateID uu &i.RunningPrebuildIds, &i.Actual, &i.Desired, + &i.Outdated, &i.Extraneous, &i.Starting, &i.Stopping, diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index ca06ccb3c7..6a645b60de 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -1,67 +1,75 @@ -- name: GetTemplatePrebuildState :many WITH - -- All prebuilds currently running - running_prebuilds AS (SELECT p.template_id, - b.template_version_id, - COUNT(*) AS count, - STRING_AGG(p.id::text, ',') AS ids - FROM workspace_prebuilds p - INNER JOIN workspace_latest_build b ON b.workspace_id = p.id - INNER JOIN provisioner_jobs pj ON b.job_id = pj.id - INNER JOIN templates t ON p.template_id = t.id - WHERE (b.transition = 'start'::workspace_transition - -- if a deletion job fails, the workspace will still be running - OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, - 'unknown'::provisioner_job_status)) - GROUP BY p.template_id, b.template_version_id), - -- All templates which have been configured for prebuilds (any version) - templates_with_prebuilds AS (SELECT t.id AS template_id, - tv.id AS template_version_id, - tv.id = t.active_version_id AS using_active_version, - tvpp.desired_instances, - t.deleted, - t.deprecated != '' AS deprecated - FROM templates t - INNER JOIN template_versions tv ON tv.template_id = t.id - INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id - INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id - WHERE t.id = @template_id::uuid - GROUP BY t.id, tv.id, tvpp.id), - -- Jobs relating to prebuilds current in-flight - prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count - FROM workspace_latest_build wlb - INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id - INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id - WHERE pj.job_status NOT IN - ('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status, - 'failed'::provisioner_job_status) - GROUP BY wpb.template_version_id, wpb.transition) + -- All prebuilds currently running + running_prebuilds AS (SELECT p.template_id, + b.template_version_id, + COUNT(*) AS count, + STRING_AGG(p.id::text, ',') AS ids + FROM workspace_prebuilds p + INNER JOIN workspace_latest_build b ON b.workspace_id = p.id + INNER JOIN provisioner_jobs pj ON b.job_id = pj.id + INNER JOIN templates t ON p.template_id = t.id + WHERE (b.transition = 'start'::workspace_transition + -- if a deletion job fails, the workspace will still be running + OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'unknown'::provisioner_job_status)) + GROUP BY p.template_id, b.template_version_id), + -- All templates which have been configured for prebuilds (any version) + templates_with_prebuilds AS (SELECT t.id AS template_id, + tv.id AS template_version_id, + tv.id = t.active_version_id AS using_active_version, + tvpp.desired_instances, + t.deleted, + t.deprecated != '' AS deprecated + FROM templates t + INNER JOIN template_versions tv ON tv.template_id = t.id + INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id + INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id + WHERE t.id = @template_id::uuid + GROUP BY t.id, tv.id, tvpp.id), + -- Jobs relating to prebuilds current in-flight + prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count + FROM workspace_latest_build wlb + INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id + INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id + WHERE pj.job_status NOT IN + ('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'failed'::provisioner_job_status) + GROUP BY wpb.template_version_id, wpb.transition) SELECT t.template_id, - t.template_version_id, - t.using_active_version AS is_active, - COALESCE(MAX(CASE WHEN p.template_version_id = t.template_version_id THEN p.ids END), - '')::text AS running_prebuild_ids, - COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), - 0)::int AS actual, -- running prebuilds for active version - COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END), - 0)::int AS desired, -- we only care about the active version's desired instances - COALESCE(MAX(CASE - WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false - THEN p.count END), - 0)::int AS extraneous, -- running prebuilds for inactive version - COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS starting, - COALESCE(MAX(CASE WHEN pip.transition = 'stop'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know - COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END), - 0)::int AS deleting, - t.deleted AS template_deleted, - t.deprecated AS template_deprecated + t.template_version_id, + t.using_active_version AS is_active, + MAX(CASE + WHEN p.template_version_id = t.template_version_id THEN p.ids + ELSE '' END)::text AS running_prebuild_ids, + COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), + 0)::int AS actual, -- running prebuilds for active version + MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)::int AS desired, -- we only care about the active version's desired instances + COALESCE(MAX(CASE + WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false + THEN p.count + ELSE 0 END), + 0)::int AS outdated, -- running prebuilds for inactive version + COALESCE(GREATEST( + (MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END)::int + - MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)), + 0), + 0) ::int AS extraneous, -- extra running prebuilds for active version + COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END), + 0)::int AS starting, + COALESCE(MAX(CASE + WHEN pip.transition = 'stop'::workspace_transition THEN pip.count + ELSE 0 END), + 0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know + COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END), + 0)::int AS deleting, + t.deleted AS template_deleted, + t.deprecated AS template_deprecated FROM templates_with_prebuilds t - LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id - LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id + LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id + LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, - p.template_version_id, t.deleted, t.deprecated; + p.template_version_id, t.deleted, t.deprecated; -- name: ClaimPrebuild :one UPDATE workspaces w diff --git a/coderd/prebuilds/controller.go b/coderd/prebuilds/controller.go index eafdd21f58..04ed77b47c 100644 --- a/coderd/prebuilds/controller.go +++ b/coderd/prebuilds/controller.go @@ -12,7 +12,9 @@ import ( "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" "github.com/coder/coder/v2/coderd/wsbuilder" + "golang.org/x/exp/slices" "math" + mrand "math/rand" "strings" "time" @@ -161,23 +163,90 @@ type reconciliationActions struct { // calculateActions MUST be called within the context of a transaction (TODO: isolation) // with an advisory lock to prevent TOCTOU races. func (c Controller) calculateActions(ctx context.Context, template database.Template, state database.GetTemplatePrebuildStateRow) (*reconciliationActions, error) { - toCreate := int(math.Max(0, float64(state.Desired-(state.Actual+state.Starting)))) - toDelete := int(math.Max(0, float64(state.Extraneous-state.Deleting-state.Stopping))) + // TODO: align workspace states with how we represent them on the FE and the CLI + // right now there's some slight differences which can lead to additional prebuilds being created - actions := &reconciliationActions{meta: state} + // TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is + // about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races! + + var ( + toCreate = int(math.Max(0, float64( + state.Desired- // The number specified in the preset + (state.Actual+state.Starting)- // The current number of prebuilds (or builds in-flight) + state.Stopping), // The number of prebuilds currently being stopped (should be 0) + )) + toDelete = int(math.Max(0, float64( + state.Outdated- // The number of prebuilds running above the desired count for active version + state.Deleting), // The number of prebuilds currently being deleted + )) + + actions = &reconciliationActions{meta: state} + runningIDs = strings.Split(state.RunningPrebuildIds, ",") + ) + + // Bail early to avoid scheduling new prebuilds while operations are in progress. + if (toCreate+toDelete) > 0 && (state.Starting+state.Stopping+state.Deleting) > 0 { + c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation", + slog.F("template_id", template.ID), slog.F("starting", state.Starting), + slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting), + slog.F("wanted_to_create", toCreate), slog.F("wanted_to_delete", toDelete)) + return actions, nil + } + + // It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so + // we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted. + if len(runningIDs) > 0 && state.Extraneous > 0 { + // Sort running IDs randomly so we can pick random victims. + slices.SortFunc(runningIDs, func(_, _ string) int { + if mrand.Float64() > 0.5 { + return -1 + } + + return 1 + }) + + var victims []uuid.UUID + for i := 0; i < int(state.Extraneous); i++ { + if i >= len(runningIDs) { + // This should never happen. + c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!", + slog.F("running_count", len(runningIDs)), + slog.F("extraneous", state.Extraneous)) + continue + } + + victim := runningIDs[i] + + id, err := uuid.Parse(victim) + if err != nil { + c.logger.Warn(ctx, "invalid prebuild ID", slog.F("template_id", template.ID), + slog.F("id", string(victim)), slog.Error(err)) + } else { + victims = append(victims, id) + } + } + + actions.deleteIDs = append(actions.deleteIDs, victims...) + + c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)", + slog.F("template_id", template.ID), slog.F("desired", state.Desired), slog.F("actual", state.Actual), slog.F("extra", state.Extraneous), + slog.F("victims", victims)) + + // Prevent the rest of the reconciliation from completing + return actions, nil + } // If the template has become deleted or deprecated since the last reconciliation, we need to ensure we // scale those prebuilds down to zero. if state.TemplateDeleted || state.TemplateDeprecated { toCreate = 0 - toDelete = int(state.Actual + state.Extraneous) + toDelete = int(state.Actual + state.Outdated) } for i := 0; i < toCreate; i++ { actions.createIDs = append(actions.createIDs, uuid.New()) } - runningIDs := strings.Split(state.RunningPrebuildIds, ",") if toDelete > 0 && len(runningIDs) != toDelete { c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!", slog.F("template_id", template.ID), slog.F("running", len(runningIDs)), slog.F("to_delete", toDelete)) @@ -248,7 +317,8 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem } levelFn(innerCtx, "template prebuild state retrieved", slog.F("to_create", len(actions.createIDs)), slog.F("to_delete", len(actions.deleteIDs)), - slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual), slog.F("extraneous", actions.meta.Extraneous), + slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual), + slog.F("outdated", actions.meta.Outdated), slog.F("extraneous", actions.meta.Extraneous), slog.F("starting", actions.meta.Starting), slog.F("stopping", actions.meta.Stopping), slog.F("deleting", actions.meta.Deleting)) // Provision workspaces within the same tx so we don't get any timing issues here.