Discrimination between "outdated" and "extraneous" prebuilds, hardening reconciliation

Signed-off-by: Danny Kopping <danny@coder.com>
This commit is contained in:
Danny Kopping
2025-01-29 11:32:36 +00:00
parent fdabb8cf07
commit eebbeb54bc
3 changed files with 212 additions and 124 deletions

View File

@ -5420,68 +5420,76 @@ func (q *sqlQuerier) ClaimPrebuild(ctx context.Context, newOwnerID uuid.UUID) (u
const getTemplatePrebuildState = `-- name: GetTemplatePrebuildState :many
WITH
-- All prebuilds currently running
running_prebuilds AS (SELECT p.template_id,
b.template_version_id,
COUNT(*) AS count,
STRING_AGG(p.id::text, ',') AS ids
FROM workspace_prebuilds p
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
INNER JOIN templates t ON p.template_id = t.id
WHERE (b.transition = 'start'::workspace_transition
-- if a deletion job fails, the workspace will still be running
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
'unknown'::provisioner_job_status))
GROUP BY p.template_id, b.template_version_id),
-- All templates which have been configured for prebuilds (any version)
templates_with_prebuilds AS (SELECT t.id AS template_id,
tv.id AS template_version_id,
tv.id = t.active_version_id AS using_active_version,
tvpp.desired_instances,
t.deleted,
t.deprecated != '' AS deprecated
FROM templates t
INNER JOIN template_versions tv ON tv.template_id = t.id
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
WHERE t.id = $1::uuid
GROUP BY t.id, tv.id, tvpp.id),
-- Jobs relating to prebuilds current in-flight
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
FROM workspace_latest_build wlb
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
WHERE pj.job_status NOT IN
('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status,
'failed'::provisioner_job_status)
GROUP BY wpb.template_version_id, wpb.transition)
-- All prebuilds currently running
running_prebuilds AS (SELECT p.template_id,
b.template_version_id,
COUNT(*) AS count,
STRING_AGG(p.id::text, ',') AS ids
FROM workspace_prebuilds p
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
INNER JOIN templates t ON p.template_id = t.id
WHERE (b.transition = 'start'::workspace_transition
-- if a deletion job fails, the workspace will still be running
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
'unknown'::provisioner_job_status))
GROUP BY p.template_id, b.template_version_id),
-- All templates which have been configured for prebuilds (any version)
templates_with_prebuilds AS (SELECT t.id AS template_id,
tv.id AS template_version_id,
tv.id = t.active_version_id AS using_active_version,
tvpp.desired_instances,
t.deleted,
t.deprecated != '' AS deprecated
FROM templates t
INNER JOIN template_versions tv ON tv.template_id = t.id
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
WHERE t.id = $1::uuid
GROUP BY t.id, tv.id, tvpp.id),
-- Jobs relating to prebuilds current in-flight
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
FROM workspace_latest_build wlb
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
WHERE pj.job_status NOT IN
('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status,
'failed'::provisioner_job_status)
GROUP BY wpb.template_version_id, wpb.transition)
SELECT t.template_id,
t.template_version_id,
t.using_active_version AS is_active,
COALESCE(MAX(CASE WHEN p.template_version_id = t.template_version_id THEN p.ids END),
'')::text AS running_prebuild_ids,
COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
0)::int AS actual, -- running prebuilds for active version
COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END),
0)::int AS desired, -- we only care about the active version's desired instances
COALESCE(MAX(CASE
WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false
THEN p.count END),
0)::int AS extraneous, -- running prebuilds for inactive version
COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS starting,
COALESCE(MAX(CASE WHEN pip.transition = 'stop'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS deleting,
t.deleted AS template_deleted,
t.deprecated AS template_deprecated
t.template_version_id,
t.using_active_version AS is_active,
MAX(CASE
WHEN p.template_version_id = t.template_version_id THEN p.ids
ELSE '' END)::text AS running_prebuild_ids,
COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
0)::int AS actual, -- running prebuilds for active version
MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)::int AS desired, -- we only care about the active version's desired instances
COALESCE(MAX(CASE
WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false
THEN p.count
ELSE 0 END),
0)::int AS outdated, -- running prebuilds for inactive version
COALESCE(GREATEST(
(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END)::int
- MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)),
0),
0) ::int AS extraneous, -- extra running prebuilds for active version
COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS starting,
COALESCE(MAX(CASE
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
ELSE 0 END),
0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS deleting,
t.deleted AS template_deleted,
t.deprecated AS template_deprecated
FROM templates_with_prebuilds t
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids,
p.template_version_id, t.deleted, t.deprecated
p.template_version_id, t.deleted, t.deprecated
`
type GetTemplatePrebuildStateRow struct {
@ -5491,6 +5499,7 @@ type GetTemplatePrebuildStateRow struct {
RunningPrebuildIds string `db:"running_prebuild_ids" json:"running_prebuild_ids"`
Actual int32 `db:"actual" json:"actual"`
Desired int32 `db:"desired" json:"desired"`
Outdated int32 `db:"outdated" json:"outdated"`
Extraneous int32 `db:"extraneous" json:"extraneous"`
Starting int32 `db:"starting" json:"starting"`
Stopping int32 `db:"stopping" json:"stopping"`
@ -5515,6 +5524,7 @@ func (q *sqlQuerier) GetTemplatePrebuildState(ctx context.Context, templateID uu
&i.RunningPrebuildIds,
&i.Actual,
&i.Desired,
&i.Outdated,
&i.Extraneous,
&i.Starting,
&i.Stopping,

View File

@ -1,67 +1,75 @@
-- name: GetTemplatePrebuildState :many
WITH
-- All prebuilds currently running
running_prebuilds AS (SELECT p.template_id,
b.template_version_id,
COUNT(*) AS count,
STRING_AGG(p.id::text, ',') AS ids
FROM workspace_prebuilds p
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
INNER JOIN templates t ON p.template_id = t.id
WHERE (b.transition = 'start'::workspace_transition
-- if a deletion job fails, the workspace will still be running
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
'unknown'::provisioner_job_status))
GROUP BY p.template_id, b.template_version_id),
-- All templates which have been configured for prebuilds (any version)
templates_with_prebuilds AS (SELECT t.id AS template_id,
tv.id AS template_version_id,
tv.id = t.active_version_id AS using_active_version,
tvpp.desired_instances,
t.deleted,
t.deprecated != '' AS deprecated
FROM templates t
INNER JOIN template_versions tv ON tv.template_id = t.id
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
WHERE t.id = @template_id::uuid
GROUP BY t.id, tv.id, tvpp.id),
-- Jobs relating to prebuilds current in-flight
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
FROM workspace_latest_build wlb
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
WHERE pj.job_status NOT IN
('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status,
'failed'::provisioner_job_status)
GROUP BY wpb.template_version_id, wpb.transition)
-- All prebuilds currently running
running_prebuilds AS (SELECT p.template_id,
b.template_version_id,
COUNT(*) AS count,
STRING_AGG(p.id::text, ',') AS ids
FROM workspace_prebuilds p
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
INNER JOIN templates t ON p.template_id = t.id
WHERE (b.transition = 'start'::workspace_transition
-- if a deletion job fails, the workspace will still be running
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
'unknown'::provisioner_job_status))
GROUP BY p.template_id, b.template_version_id),
-- All templates which have been configured for prebuilds (any version)
templates_with_prebuilds AS (SELECT t.id AS template_id,
tv.id AS template_version_id,
tv.id = t.active_version_id AS using_active_version,
tvpp.desired_instances,
t.deleted,
t.deprecated != '' AS deprecated
FROM templates t
INNER JOIN template_versions tv ON tv.template_id = t.id
INNER JOIN template_version_presets tvp ON tvp.template_version_id = tv.id
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
WHERE t.id = @template_id::uuid
GROUP BY t.id, tv.id, tvpp.id),
-- Jobs relating to prebuilds current in-flight
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
FROM workspace_latest_build wlb
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
WHERE pj.job_status NOT IN
('succeeded'::provisioner_job_status, 'canceled'::provisioner_job_status,
'failed'::provisioner_job_status)
GROUP BY wpb.template_version_id, wpb.transition)
SELECT t.template_id,
t.template_version_id,
t.using_active_version AS is_active,
COALESCE(MAX(CASE WHEN p.template_version_id = t.template_version_id THEN p.ids END),
'')::text AS running_prebuild_ids,
COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
0)::int AS actual, -- running prebuilds for active version
COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END),
0)::int AS desired, -- we only care about the active version's desired instances
COALESCE(MAX(CASE
WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false
THEN p.count END),
0)::int AS extraneous, -- running prebuilds for inactive version
COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS starting,
COALESCE(MAX(CASE WHEN pip.transition = 'stop'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS deleting,
t.deleted AS template_deleted,
t.deprecated AS template_deprecated
t.template_version_id,
t.using_active_version AS is_active,
MAX(CASE
WHEN p.template_version_id = t.template_version_id THEN p.ids
ELSE '' END)::text AS running_prebuild_ids,
COALESCE(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
0)::int AS actual, -- running prebuilds for active version
MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)::int AS desired, -- we only care about the active version's desired instances
COALESCE(MAX(CASE
WHEN p.template_version_id = t.template_version_id AND t.using_active_version = false
THEN p.count
ELSE 0 END),
0)::int AS outdated, -- running prebuilds for inactive version
COALESCE(GREATEST(
(MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END)::int
- MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END)),
0),
0) ::int AS extraneous, -- extra running prebuilds for active version
COALESCE(MAX(CASE WHEN pip.transition = 'start'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS starting,
COALESCE(MAX(CASE
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
ELSE 0 END),
0)::int AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
COALESCE(MAX(CASE WHEN pip.transition = 'delete'::workspace_transition THEN pip.count ELSE 0 END),
0)::int AS deleting,
t.deleted AS template_deleted,
t.deprecated AS template_deprecated
FROM templates_with_prebuilds t
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids,
p.template_version_id, t.deleted, t.deprecated;
p.template_version_id, t.deleted, t.deprecated;
-- name: ClaimPrebuild :one
UPDATE workspaces w

View File

@ -12,7 +12,9 @@ import (
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
"github.com/coder/coder/v2/coderd/wsbuilder"
"golang.org/x/exp/slices"
"math"
mrand "math/rand"
"strings"
"time"
@ -161,23 +163,90 @@ type reconciliationActions struct {
// calculateActions MUST be called within the context of a transaction (TODO: isolation)
// with an advisory lock to prevent TOCTOU races.
func (c Controller) calculateActions(ctx context.Context, template database.Template, state database.GetTemplatePrebuildStateRow) (*reconciliationActions, error) {
toCreate := int(math.Max(0, float64(state.Desired-(state.Actual+state.Starting))))
toDelete := int(math.Max(0, float64(state.Extraneous-state.Deleting-state.Stopping)))
// TODO: align workspace states with how we represent them on the FE and the CLI
// right now there's some slight differences which can lead to additional prebuilds being created
actions := &reconciliationActions{meta: state}
// TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is
// about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races!
var (
toCreate = int(math.Max(0, float64(
state.Desired- // The number specified in the preset
(state.Actual+state.Starting)- // The current number of prebuilds (or builds in-flight)
state.Stopping), // The number of prebuilds currently being stopped (should be 0)
))
toDelete = int(math.Max(0, float64(
state.Outdated- // The number of prebuilds running above the desired count for active version
state.Deleting), // The number of prebuilds currently being deleted
))
actions = &reconciliationActions{meta: state}
runningIDs = strings.Split(state.RunningPrebuildIds, ",")
)
// Bail early to avoid scheduling new prebuilds while operations are in progress.
if (toCreate+toDelete) > 0 && (state.Starting+state.Stopping+state.Deleting) > 0 {
c.logger.Warn(ctx, "prebuild operations in progress, skipping reconciliation",
slog.F("template_id", template.ID), slog.F("starting", state.Starting),
slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting),
slog.F("wanted_to_create", toCreate), slog.F("wanted_to_delete", toDelete))
return actions, nil
}
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
if len(runningIDs) > 0 && state.Extraneous > 0 {
// Sort running IDs randomly so we can pick random victims.
slices.SortFunc(runningIDs, func(_, _ string) int {
if mrand.Float64() > 0.5 {
return -1
}
return 1
})
var victims []uuid.UUID
for i := 0; i < int(state.Extraneous); i++ {
if i >= len(runningIDs) {
// This should never happen.
c.logger.Warn(ctx, "unexpected reconciliation state; extraneous count exceeds running prebuilds count!",
slog.F("running_count", len(runningIDs)),
slog.F("extraneous", state.Extraneous))
continue
}
victim := runningIDs[i]
id, err := uuid.Parse(victim)
if err != nil {
c.logger.Warn(ctx, "invalid prebuild ID", slog.F("template_id", template.ID),
slog.F("id", string(victim)), slog.Error(err))
} else {
victims = append(victims, id)
}
}
actions.deleteIDs = append(actions.deleteIDs, victims...)
c.logger.Warn(ctx, "found extra prebuilds running, picking random victim(s)",
slog.F("template_id", template.ID), slog.F("desired", state.Desired), slog.F("actual", state.Actual), slog.F("extra", state.Extraneous),
slog.F("victims", victims))
// Prevent the rest of the reconciliation from completing
return actions, nil
}
// If the template has become deleted or deprecated since the last reconciliation, we need to ensure we
// scale those prebuilds down to zero.
if state.TemplateDeleted || state.TemplateDeprecated {
toCreate = 0
toDelete = int(state.Actual + state.Extraneous)
toDelete = int(state.Actual + state.Outdated)
}
for i := 0; i < toCreate; i++ {
actions.createIDs = append(actions.createIDs, uuid.New())
}
runningIDs := strings.Split(state.RunningPrebuildIds, ",")
if toDelete > 0 && len(runningIDs) != toDelete {
c.logger.Warn(ctx, "mismatch between running prebuilds and expected deletion count!",
slog.F("template_id", template.ID), slog.F("running", len(runningIDs)), slog.F("to_delete", toDelete))
@ -248,7 +317,8 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem
}
levelFn(innerCtx, "template prebuild state retrieved",
slog.F("to_create", len(actions.createIDs)), slog.F("to_delete", len(actions.deleteIDs)),
slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual), slog.F("extraneous", actions.meta.Extraneous),
slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual),
slog.F("outdated", actions.meta.Outdated), slog.F("extraneous", actions.meta.Extraneous),
slog.F("starting", actions.meta.Starting), slog.F("stopping", actions.meta.Stopping), slog.F("deleting", actions.meta.Deleting))
// Provision workspaces within the same tx so we don't get any timing issues here.