mirror of
https://github.com/coder/coder.git
synced 2025-07-13 21:36:50 +00:00
Fixed bug in state query relating to multiple template versions & workspaces in partially-deleted statuses
Signed-off-by: Danny Kopping <danny@coder.com>
This commit is contained in:
@ -5398,10 +5398,17 @@ func (q *sqlQuerier) GetParameterSchemasByJobID(ctx context.Context, jobID uuid.
|
||||
const getTemplatePrebuildState = `-- name: GetTemplatePrebuildState :one
|
||||
WITH
|
||||
-- All prebuilds currently running
|
||||
running_prebuilds AS (SELECT p.template_id, b.template_version_id, COUNT(*) AS count, STRING_AGG(p.id::text, ',') AS ids
|
||||
running_prebuilds AS (SELECT p.template_id,
|
||||
b.template_version_id,
|
||||
COUNT(*) AS count,
|
||||
STRING_AGG(p.id::text, ',') AS ids
|
||||
FROM workspace_prebuilds p
|
||||
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
|
||||
WHERE b.transition = 'start'::workspace_transition
|
||||
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
|
||||
WHERE (b.transition = 'start'::workspace_transition
|
||||
-- if a deletion job fails, the workspace will still be running
|
||||
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
|
||||
'unknown'::provisioner_job_status))
|
||||
GROUP BY p.template_id, b.template_version_id),
|
||||
-- All templates which have been configured for prebuilds (any version)
|
||||
templates_with_prebuilds AS (SELECT t.id AS template_id,
|
||||
@ -5416,6 +5423,7 @@ WITH
|
||||
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
|
||||
WHERE t.id = $1::uuid
|
||||
GROUP BY t.id, tv.id, tvpp.id),
|
||||
-- Jobs relating to prebuilds current in-flight
|
||||
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
|
||||
FROM workspace_prebuild_builds wpb
|
||||
INNER JOIN workspace_latest_build wlb ON wpb.workspace_id = wlb.workspace_id
|
||||
@ -5425,29 +5433,38 @@ WITH
|
||||
'failed'::provisioner_job_status)
|
||||
GROUP BY wpb.template_version_id, wpb.transition)
|
||||
SELECT t.template_id,
|
||||
p.ids AS running_prebuild_ids,
|
||||
CAST(SUM(CASE WHEN t.using_active_version THEN p.count ELSE 0 END) AS INT) AS actual, -- running prebuilds for active version
|
||||
CAST(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END) AS int) AS desired, -- we only care about the active version's desired instances
|
||||
CAST(SUM(CASE WHEN t.using_active_version THEN 0 ELSE p.count END) AS INT) AS extraneous, -- running prebuilds for inactive version
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'start'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS starting,
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'delete'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS deleting,
|
||||
t.deleted AS template_deleted,
|
||||
t.deprecated AS template_deprecated
|
||||
t.template_version_id,
|
||||
t.using_active_version AS is_active,
|
||||
p.ids AS running_prebuild_ids,
|
||||
CAST(COALESCE(
|
||||
MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
|
||||
0) AS INT) AS actual, -- running prebuilds for active version
|
||||
CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END),
|
||||
0) AS INT) AS desired, -- we only care about the active version's desired instances
|
||||
CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN 0 ELSE p.count END),
|
||||
0) AS INT) AS extraneous, -- running prebuilds for inactive version
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'start'::workspace_transition THEN pip.count
|
||||
ELSE 0 END), 0) AS INT) AS starting,
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
|
||||
ELSE 0 END),
|
||||
0) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'delete'::workspace_transition THEN pip.count
|
||||
ELSE 0 END), 0) AS INT) AS deleting,
|
||||
t.deleted AS template_deleted,
|
||||
t.deprecated AS template_deprecated
|
||||
FROM templates_with_prebuilds t
|
||||
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
|
||||
LEFT JOIN running_prebuilds p ON p.template_id = t.template_id
|
||||
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
|
||||
GROUP BY t.template_id, p.count, p.ids, t.deleted, t.deprecated
|
||||
GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, t.deleted, t.deprecated
|
||||
`
|
||||
|
||||
type GetTemplatePrebuildStateRow struct {
|
||||
TemplateID uuid.UUID `db:"template_id" json:"template_id"`
|
||||
TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"`
|
||||
IsActive bool `db:"is_active" json:"is_active"`
|
||||
RunningPrebuildIds []byte `db:"running_prebuild_ids" json:"running_prebuild_ids"`
|
||||
Actual int32 `db:"actual" json:"actual"`
|
||||
Desired int32 `db:"desired" json:"desired"`
|
||||
@ -5464,6 +5481,8 @@ func (q *sqlQuerier) GetTemplatePrebuildState(ctx context.Context, templateID uu
|
||||
var i GetTemplatePrebuildStateRow
|
||||
err := row.Scan(
|
||||
&i.TemplateID,
|
||||
&i.TemplateVersionID,
|
||||
&i.IsActive,
|
||||
&i.RunningPrebuildIds,
|
||||
&i.Actual,
|
||||
&i.Desired,
|
||||
|
@ -1,10 +1,17 @@
|
||||
-- name: GetTemplatePrebuildState :one
|
||||
WITH
|
||||
-- All prebuilds currently running
|
||||
running_prebuilds AS (SELECT p.template_id, b.template_version_id, COUNT(*) AS count, STRING_AGG(p.id::text, ',') AS ids
|
||||
running_prebuilds AS (SELECT p.template_id,
|
||||
b.template_version_id,
|
||||
COUNT(*) AS count,
|
||||
STRING_AGG(p.id::text, ',') AS ids
|
||||
FROM workspace_prebuilds p
|
||||
INNER JOIN workspace_latest_build b ON b.workspace_id = p.id
|
||||
WHERE b.transition = 'start'::workspace_transition
|
||||
INNER JOIN provisioner_jobs pj ON b.job_id = pj.id
|
||||
WHERE (b.transition = 'start'::workspace_transition
|
||||
-- if a deletion job fails, the workspace will still be running
|
||||
OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status,
|
||||
'unknown'::provisioner_job_status))
|
||||
GROUP BY p.template_id, b.template_version_id),
|
||||
-- All templates which have been configured for prebuilds (any version)
|
||||
templates_with_prebuilds AS (SELECT t.id AS template_id,
|
||||
@ -19,6 +26,7 @@ WITH
|
||||
INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
|
||||
WHERE t.id = @template_id::uuid
|
||||
GROUP BY t.id, tv.id, tvpp.id),
|
||||
-- Jobs relating to prebuilds current in-flight
|
||||
prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
|
||||
FROM workspace_prebuild_builds wpb
|
||||
INNER JOIN workspace_latest_build wlb ON wpb.workspace_id = wlb.workspace_id
|
||||
@ -28,22 +36,29 @@ WITH
|
||||
'failed'::provisioner_job_status)
|
||||
GROUP BY wpb.template_version_id, wpb.transition)
|
||||
SELECT t.template_id,
|
||||
p.ids AS running_prebuild_ids,
|
||||
CAST(SUM(CASE WHEN t.using_active_version THEN p.count ELSE 0 END) AS INT) AS actual, -- running prebuilds for active version
|
||||
CAST(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END) AS int) AS desired, -- we only care about the active version's desired instances
|
||||
CAST(SUM(CASE WHEN t.using_active_version THEN 0 ELSE p.count END) AS INT) AS extraneous, -- running prebuilds for inactive version
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'start'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS starting,
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
|
||||
CAST(MAX(CASE
|
||||
WHEN pip.transition = 'delete'::workspace_transition THEN pip.count
|
||||
ELSE 0 END) AS INT) AS deleting,
|
||||
t.deleted AS template_deleted,
|
||||
t.deprecated AS template_deprecated
|
||||
t.template_version_id,
|
||||
t.using_active_version AS is_active,
|
||||
p.ids AS running_prebuild_ids,
|
||||
CAST(COALESCE(
|
||||
MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END),
|
||||
0) AS INT) AS actual, -- running prebuilds for active version
|
||||
CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END),
|
||||
0) AS INT) AS desired, -- we only care about the active version's desired instances
|
||||
CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN 0 ELSE p.count END),
|
||||
0) AS INT) AS extraneous, -- running prebuilds for inactive version
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'start'::workspace_transition THEN pip.count
|
||||
ELSE 0 END), 0) AS INT) AS starting,
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'stop'::workspace_transition THEN pip.count
|
||||
ELSE 0 END),
|
||||
0) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know
|
||||
CAST(COALESCE(MAX(CASE
|
||||
WHEN pip.transition = 'delete'::workspace_transition THEN pip.count
|
||||
ELSE 0 END), 0) AS INT) AS deleting,
|
||||
t.deleted AS template_deleted,
|
||||
t.deprecated AS template_deprecated
|
||||
FROM templates_with_prebuilds t
|
||||
LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id
|
||||
LEFT JOIN running_prebuilds p ON p.template_id = t.template_id
|
||||
LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id
|
||||
GROUP BY t.template_id, p.count, p.ids, t.deleted, t.deprecated;
|
||||
GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, t.deleted, t.deprecated;
|
||||
|
@ -156,16 +156,9 @@ type reconciliationActions struct {
|
||||
meta database.GetTemplatePrebuildStateRow
|
||||
}
|
||||
|
||||
// calculeActions MUST be called within the context of a transaction (TODO: isolation)
|
||||
// calculateActions MUST be called within the context of a transaction (TODO: isolation)
|
||||
// with an advisory lock to prevent TOCTOU races.
|
||||
func (c Controller) calculeActions(ctx context.Context, db database.Store, template database.Template) (*reconciliationActions, error) {
|
||||
// TODO: change to "many" and return 1 or more rows, but only one should be returned
|
||||
// more than 1 response is indicative of a query problem
|
||||
state, err := db.GetTemplatePrebuildState(ctx, template.ID)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("failed to retrieve template's prebuild state: %w", err)
|
||||
}
|
||||
|
||||
func (c Controller) calculateActions(ctx context.Context, template database.Template, state database.GetTemplatePrebuildStateRow) (*reconciliationActions, error) {
|
||||
toCreate := int(math.Max(0, float64(state.Desired-(state.Actual+state.Starting))))
|
||||
toDelete := int(math.Max(0, float64(state.Extraneous-state.Deleting-state.Stopping)))
|
||||
|
||||
@ -188,6 +181,8 @@ func (c Controller) calculeActions(ctx context.Context, db database.Store, templ
|
||||
slog.F("template_id", template.ID), slog.F("running", len(runningIDs)), slog.F("to_destroy", toDelete))
|
||||
}
|
||||
|
||||
// TODO: implement lookup to not perform same action on workspace multiple times in $period
|
||||
// i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion
|
||||
for i := 0; i < toDelete; i++ {
|
||||
if i >= len(runningIDs) {
|
||||
// Above warning will have already addressed this.
|
||||
@ -222,35 +217,51 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem
|
||||
innerCtx, cancel := context.WithTimeout(ctx, time.Second*30)
|
||||
defer cancel()
|
||||
|
||||
actions, err := c.calculeActions(innerCtx, db, template)
|
||||
// TODO: change to "many" and return 1 or more rows, but only one should be returned
|
||||
// more than 1 response is indicative of a query problem
|
||||
state, err := db.GetTemplatePrebuildState(ctx, template.ID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("failed to retrieve template's prebuild states: %w", err)
|
||||
}
|
||||
|
||||
logger.Info(innerCtx, "template prebuild state retrieved",
|
||||
actions, err := c.calculateActions(innerCtx, template, state)
|
||||
if err != nil {
|
||||
logger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: authz // Can't use existing profiles (i.e. AsSystemRestricted) because of dbauthz rules
|
||||
var ownerCtx = dbauthz.As(ctx, rbac.Subject{
|
||||
ID: "owner",
|
||||
Roles: rbac.RoleIdentifiers{rbac.RoleOwner()},
|
||||
Groups: []string{},
|
||||
Scope: rbac.ExpandableScope(rbac.ScopeAll),
|
||||
})
|
||||
|
||||
levelFn := logger.Debug
|
||||
if len(actions.createIDs) > 0 || len(actions.deleteIDs) > 0 {
|
||||
// Only log with info level when there's a change that needs to be effected.
|
||||
levelFn = logger.Info
|
||||
}
|
||||
levelFn(innerCtx, "template prebuild state retrieved",
|
||||
slog.F("to_create", len(actions.createIDs)), slog.F("to_destroy", len(actions.deleteIDs)),
|
||||
slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual), slog.F("extraneous", actions.meta.Extraneous),
|
||||
slog.F("starting", actions.meta.Starting), slog.F("stopping", actions.meta.Stopping), slog.F("deleting", actions.meta.Deleting))
|
||||
|
||||
// Provision workspaces within the same tx so we don't get any timing issues here.
|
||||
// i.e. we hold the advisory lock until all reconciliatory actions have been taken.
|
||||
// TODO: max per reconciliation iteration?
|
||||
|
||||
for _, id := range actions.createIDs {
|
||||
// Provision workspaces within the same tx so we don't get any timing issues here.
|
||||
// i.e. we hold the advisory lock until all reconciliatory actions have been taken.
|
||||
|
||||
// TODO: loop
|
||||
// TODO: max per reconciliation iteration?
|
||||
|
||||
// TODO: authz // Can't use existing profiles (i.e. AsSystemRestricted) because of dbauthz rules
|
||||
var ownerCtx = dbauthz.As(ctx, rbac.Subject{
|
||||
ID: "owner",
|
||||
Roles: rbac.RoleIdentifiers{rbac.RoleOwner()},
|
||||
Groups: []string{},
|
||||
Scope: rbac.ExpandableScope(rbac.ScopeAll),
|
||||
})
|
||||
|
||||
if err := c.provision(ownerCtx, template, id, db); err != nil {
|
||||
logger.Error(ctx, "failed to provision prebuild", slog.Error(err))
|
||||
if err := c.createPrebuild(ownerCtx, db, id, template); err != nil {
|
||||
logger.Error(ctx, "failed to create prebuild", slog.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
for _, id := range actions.deleteIDs {
|
||||
// TODO: actually delete
|
||||
logger.Info(ctx, "would've deleted prebuild", slog.F("workspace_id", id))
|
||||
if err := c.deletePrebuild(ownerCtx, db, id, template); err != nil {
|
||||
logger.Error(ctx, "failed to delete prebuild", slog.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -266,11 +277,7 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c Controller) provision(ctx context.Context, template database.Template, prebuildID uuid.UUID, db database.Store) error {
|
||||
var (
|
||||
provisionerJob *database.ProvisionerJob
|
||||
)
|
||||
|
||||
func (c Controller) createPrebuild(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template) error {
|
||||
name := fmt.Sprintf("prebuild-%s", prebuildID)
|
||||
|
||||
now := dbtime.Now()
|
||||
@ -296,14 +303,30 @@ func (c Controller) provision(ctx context.Context, template database.Template, p
|
||||
return xerrors.Errorf("get workspace by ID: %w", err)
|
||||
}
|
||||
|
||||
builder := wsbuilder.New(workspace, database.WorkspaceTransitionStart).
|
||||
c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name), slog.F("workspace_id", prebuildID.String()))
|
||||
|
||||
return c.provision(ctx, db, prebuildID, template, database.WorkspaceTransitionStart, workspace)
|
||||
}
|
||||
func (c Controller) deletePrebuild(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template) error {
|
||||
workspace, err := db.GetWorkspaceByID(ctx, prebuildID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get workspace by ID: %w", err)
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "attempting to delete prebuild", slog.F("workspace_id", prebuildID.String()))
|
||||
|
||||
return c.provision(ctx, db, prebuildID, template, database.WorkspaceTransitionDelete, workspace)
|
||||
}
|
||||
|
||||
func (c Controller) provision(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template, transition database.WorkspaceTransition, workspace database.Workspace) error {
|
||||
builder := wsbuilder.New(workspace, transition).
|
||||
Reason(database.BuildReasonInitiator).
|
||||
Initiator(PrebuildOwnerUUID).
|
||||
ActiveVersion().
|
||||
VersionID(template.ActiveVersionID)
|
||||
// RichParameterValues(req.RichParameterValues) // TODO: fetch preset's params
|
||||
|
||||
_, provisionerJob, _, err = builder.Build(
|
||||
_, provisionerJob, _, err := builder.Build(
|
||||
ctx,
|
||||
db,
|
||||
func(action policy.Action, object rbac.Objecter) bool {
|
||||
@ -321,8 +344,8 @@ func (c Controller) provision(ctx context.Context, template database.Template, p
|
||||
c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err))
|
||||
}
|
||||
|
||||
c.logger.Info(ctx, "provisioned new prebuild", slog.F("prebuild_id", prebuildID.String()),
|
||||
slog.F("job_id", provisionerJob.ID))
|
||||
c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition),
|
||||
slog.F("prebuild_id", prebuildID.String()), slog.F("job_id", provisionerJob.ID))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
Reference in New Issue
Block a user