From b7c43f663e8ad59625ff5201bff74f01f1433388 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Tue, 28 Jan 2025 05:19:38 +0000 Subject: [PATCH] Fixed bug in state query relating to multiple template versions & workspaces in partially-deleted statuses Signed-off-by: Danny Kopping --- coderd/database/queries.sql.go | 57 ++++++++++----- coderd/database/queries/prebuilds.sql | 53 +++++++++----- coderd/prebuilds/controller.go | 99 +++++++++++++++++---------- 3 files changed, 133 insertions(+), 76 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 20433ef182..d21854cc27 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -5398,10 +5398,17 @@ func (q *sqlQuerier) GetParameterSchemasByJobID(ctx context.Context, jobID uuid. const getTemplatePrebuildState = `-- name: GetTemplatePrebuildState :one WITH -- All prebuilds currently running - running_prebuilds AS (SELECT p.template_id, b.template_version_id, COUNT(*) AS count, STRING_AGG(p.id::text, ',') AS ids + running_prebuilds AS (SELECT p.template_id, + b.template_version_id, + COUNT(*) AS count, + STRING_AGG(p.id::text, ',') AS ids FROM workspace_prebuilds p INNER JOIN workspace_latest_build b ON b.workspace_id = p.id - WHERE b.transition = 'start'::workspace_transition + INNER JOIN provisioner_jobs pj ON b.job_id = pj.id + WHERE (b.transition = 'start'::workspace_transition + -- if a deletion job fails, the workspace will still be running + OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'unknown'::provisioner_job_status)) GROUP BY p.template_id, b.template_version_id), -- All templates which have been configured for prebuilds (any version) templates_with_prebuilds AS (SELECT t.id AS template_id, @@ -5416,6 +5423,7 @@ WITH INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id WHERE t.id = $1::uuid GROUP BY t.id, tv.id, tvpp.id), + -- Jobs relating to prebuilds current in-flight prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count FROM workspace_prebuild_builds wpb INNER JOIN workspace_latest_build wlb ON wpb.workspace_id = wlb.workspace_id @@ -5425,29 +5433,38 @@ WITH 'failed'::provisioner_job_status) GROUP BY wpb.template_version_id, wpb.transition) SELECT t.template_id, - p.ids AS running_prebuild_ids, - CAST(SUM(CASE WHEN t.using_active_version THEN p.count ELSE 0 END) AS INT) AS actual, -- running prebuilds for active version - CAST(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END) AS int) AS desired, -- we only care about the active version's desired instances - CAST(SUM(CASE WHEN t.using_active_version THEN 0 ELSE p.count END) AS INT) AS extraneous, -- running prebuilds for inactive version - CAST(MAX(CASE - WHEN pip.transition = 'start'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS starting, - CAST(MAX(CASE - WHEN pip.transition = 'stop'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know - CAST(MAX(CASE - WHEN pip.transition = 'delete'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS deleting, - t.deleted AS template_deleted, - t.deprecated AS template_deprecated + t.template_version_id, + t.using_active_version AS is_active, + p.ids AS running_prebuild_ids, + CAST(COALESCE( + MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), + 0) AS INT) AS actual, -- running prebuilds for active version + CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END), + 0) AS INT) AS desired, -- we only care about the active version's desired instances + CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN 0 ELSE p.count END), + 0) AS INT) AS extraneous, -- running prebuilds for inactive version + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'start'::workspace_transition THEN pip.count + ELSE 0 END), 0) AS INT) AS starting, + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'stop'::workspace_transition THEN pip.count + ELSE 0 END), + 0) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'delete'::workspace_transition THEN pip.count + ELSE 0 END), 0) AS INT) AS deleting, + t.deleted AS template_deleted, + t.deprecated AS template_deprecated FROM templates_with_prebuilds t - LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id + LEFT JOIN running_prebuilds p ON p.template_id = t.template_id LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id -GROUP BY t.template_id, p.count, p.ids, t.deleted, t.deprecated +GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, t.deleted, t.deprecated ` type GetTemplatePrebuildStateRow struct { TemplateID uuid.UUID `db:"template_id" json:"template_id"` + TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` + IsActive bool `db:"is_active" json:"is_active"` RunningPrebuildIds []byte `db:"running_prebuild_ids" json:"running_prebuild_ids"` Actual int32 `db:"actual" json:"actual"` Desired int32 `db:"desired" json:"desired"` @@ -5464,6 +5481,8 @@ func (q *sqlQuerier) GetTemplatePrebuildState(ctx context.Context, templateID uu var i GetTemplatePrebuildStateRow err := row.Scan( &i.TemplateID, + &i.TemplateVersionID, + &i.IsActive, &i.RunningPrebuildIds, &i.Actual, &i.Desired, diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index e11f8ab73a..dce2590354 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -1,10 +1,17 @@ -- name: GetTemplatePrebuildState :one WITH -- All prebuilds currently running - running_prebuilds AS (SELECT p.template_id, b.template_version_id, COUNT(*) AS count, STRING_AGG(p.id::text, ',') AS ids + running_prebuilds AS (SELECT p.template_id, + b.template_version_id, + COUNT(*) AS count, + STRING_AGG(p.id::text, ',') AS ids FROM workspace_prebuilds p INNER JOIN workspace_latest_build b ON b.workspace_id = p.id - WHERE b.transition = 'start'::workspace_transition + INNER JOIN provisioner_jobs pj ON b.job_id = pj.id + WHERE (b.transition = 'start'::workspace_transition + -- if a deletion job fails, the workspace will still be running + OR pj.job_status IN ('failed'::provisioner_job_status, 'canceled'::provisioner_job_status, + 'unknown'::provisioner_job_status)) GROUP BY p.template_id, b.template_version_id), -- All templates which have been configured for prebuilds (any version) templates_with_prebuilds AS (SELECT t.id AS template_id, @@ -19,6 +26,7 @@ WITH INNER JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id WHERE t.id = @template_id::uuid GROUP BY t.id, tv.id, tvpp.id), + -- Jobs relating to prebuilds current in-flight prebuilds_in_progress AS (SELECT wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count FROM workspace_prebuild_builds wpb INNER JOIN workspace_latest_build wlb ON wpb.workspace_id = wlb.workspace_id @@ -28,22 +36,29 @@ WITH 'failed'::provisioner_job_status) GROUP BY wpb.template_version_id, wpb.transition) SELECT t.template_id, - p.ids AS running_prebuild_ids, - CAST(SUM(CASE WHEN t.using_active_version THEN p.count ELSE 0 END) AS INT) AS actual, -- running prebuilds for active version - CAST(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END) AS int) AS desired, -- we only care about the active version's desired instances - CAST(SUM(CASE WHEN t.using_active_version THEN 0 ELSE p.count END) AS INT) AS extraneous, -- running prebuilds for inactive version - CAST(MAX(CASE - WHEN pip.transition = 'start'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS starting, - CAST(MAX(CASE - WHEN pip.transition = 'stop'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know - CAST(MAX(CASE - WHEN pip.transition = 'delete'::workspace_transition THEN pip.count - ELSE 0 END) AS INT) AS deleting, - t.deleted AS template_deleted, - t.deprecated AS template_deprecated + t.template_version_id, + t.using_active_version AS is_active, + p.ids AS running_prebuild_ids, + CAST(COALESCE( + MAX(CASE WHEN t.using_active_version THEN p.count ELSE 0 END), + 0) AS INT) AS actual, -- running prebuilds for active version + CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN t.desired_instances ELSE 0 END), + 0) AS INT) AS desired, -- we only care about the active version's desired instances + CAST(COALESCE(MAX(CASE WHEN t.using_active_version THEN 0 ELSE p.count END), + 0) AS INT) AS extraneous, -- running prebuilds for inactive version + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'start'::workspace_transition THEN pip.count + ELSE 0 END), 0) AS INT) AS starting, + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'stop'::workspace_transition THEN pip.count + ELSE 0 END), + 0) AS INT) AS stopping, -- not strictly needed, since prebuilds should never be left if a "stopped" state, but useful to know + CAST(COALESCE(MAX(CASE + WHEN pip.transition = 'delete'::workspace_transition THEN pip.count + ELSE 0 END), 0) AS INT) AS deleting, + t.deleted AS template_deleted, + t.deprecated AS template_deprecated FROM templates_with_prebuilds t - LEFT JOIN running_prebuilds p ON p.template_version_id = t.template_version_id + LEFT JOIN running_prebuilds p ON p.template_id = t.template_id LEFT JOIN prebuilds_in_progress pip ON pip.template_version_id = t.template_version_id -GROUP BY t.template_id, p.count, p.ids, t.deleted, t.deprecated; +GROUP BY t.using_active_version, t.template_id, t.template_version_id, p.count, p.ids, t.deleted, t.deprecated; diff --git a/coderd/prebuilds/controller.go b/coderd/prebuilds/controller.go index 0b95206c07..2ad2137a58 100644 --- a/coderd/prebuilds/controller.go +++ b/coderd/prebuilds/controller.go @@ -156,16 +156,9 @@ type reconciliationActions struct { meta database.GetTemplatePrebuildStateRow } -// calculeActions MUST be called within the context of a transaction (TODO: isolation) +// calculateActions MUST be called within the context of a transaction (TODO: isolation) // with an advisory lock to prevent TOCTOU races. -func (c Controller) calculeActions(ctx context.Context, db database.Store, template database.Template) (*reconciliationActions, error) { - // TODO: change to "many" and return 1 or more rows, but only one should be returned - // more than 1 response is indicative of a query problem - state, err := db.GetTemplatePrebuildState(ctx, template.ID) - if err != nil { - return nil, xerrors.Errorf("failed to retrieve template's prebuild state: %w", err) - } - +func (c Controller) calculateActions(ctx context.Context, template database.Template, state database.GetTemplatePrebuildStateRow) (*reconciliationActions, error) { toCreate := int(math.Max(0, float64(state.Desired-(state.Actual+state.Starting)))) toDelete := int(math.Max(0, float64(state.Extraneous-state.Deleting-state.Stopping))) @@ -188,6 +181,8 @@ func (c Controller) calculeActions(ctx context.Context, db database.Store, templ slog.F("template_id", template.ID), slog.F("running", len(runningIDs)), slog.F("to_destroy", toDelete)) } + // TODO: implement lookup to not perform same action on workspace multiple times in $period + // i.e. a workspace cannot be deleted for some reason, which continually makes it eligible for deletion for i := 0; i < toDelete; i++ { if i >= len(runningIDs) { // Above warning will have already addressed this. @@ -222,35 +217,51 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem innerCtx, cancel := context.WithTimeout(ctx, time.Second*30) defer cancel() - actions, err := c.calculeActions(innerCtx, db, template) + // TODO: change to "many" and return 1 or more rows, but only one should be returned + // more than 1 response is indicative of a query problem + state, err := db.GetTemplatePrebuildState(ctx, template.ID) + if err != nil { + return xerrors.Errorf("failed to retrieve template's prebuild states: %w", err) + } - logger.Info(innerCtx, "template prebuild state retrieved", + actions, err := c.calculateActions(innerCtx, template, state) + if err != nil { + logger.Error(ctx, "failed to calculate reconciliation actions", slog.Error(err)) + return nil + } + + // TODO: authz // Can't use existing profiles (i.e. AsSystemRestricted) because of dbauthz rules + var ownerCtx = dbauthz.As(ctx, rbac.Subject{ + ID: "owner", + Roles: rbac.RoleIdentifiers{rbac.RoleOwner()}, + Groups: []string{}, + Scope: rbac.ExpandableScope(rbac.ScopeAll), + }) + + levelFn := logger.Debug + if len(actions.createIDs) > 0 || len(actions.deleteIDs) > 0 { + // Only log with info level when there's a change that needs to be effected. + levelFn = logger.Info + } + levelFn(innerCtx, "template prebuild state retrieved", slog.F("to_create", len(actions.createIDs)), slog.F("to_destroy", len(actions.deleteIDs)), slog.F("desired", actions.meta.Desired), slog.F("actual", actions.meta.Actual), slog.F("extraneous", actions.meta.Extraneous), slog.F("starting", actions.meta.Starting), slog.F("stopping", actions.meta.Stopping), slog.F("deleting", actions.meta.Deleting)) + // Provision workspaces within the same tx so we don't get any timing issues here. + // i.e. we hold the advisory lock until all reconciliatory actions have been taken. + // TODO: max per reconciliation iteration? + for _, id := range actions.createIDs { - // Provision workspaces within the same tx so we don't get any timing issues here. - // i.e. we hold the advisory lock until all reconciliatory actions have been taken. - - // TODO: loop - // TODO: max per reconciliation iteration? - - // TODO: authz // Can't use existing profiles (i.e. AsSystemRestricted) because of dbauthz rules - var ownerCtx = dbauthz.As(ctx, rbac.Subject{ - ID: "owner", - Roles: rbac.RoleIdentifiers{rbac.RoleOwner()}, - Groups: []string{}, - Scope: rbac.ExpandableScope(rbac.ScopeAll), - }) - - if err := c.provision(ownerCtx, template, id, db); err != nil { - logger.Error(ctx, "failed to provision prebuild", slog.Error(err)) + if err := c.createPrebuild(ownerCtx, db, id, template); err != nil { + logger.Error(ctx, "failed to create prebuild", slog.Error(err)) } } + for _, id := range actions.deleteIDs { - // TODO: actually delete - logger.Info(ctx, "would've deleted prebuild", slog.F("workspace_id", id)) + if err := c.deletePrebuild(ownerCtx, db, id, template); err != nil { + logger.Error(ctx, "failed to delete prebuild", slog.Error(err)) + } } return nil @@ -266,11 +277,7 @@ func (c Controller) reconcileTemplate(ctx context.Context, template database.Tem return nil } -func (c Controller) provision(ctx context.Context, template database.Template, prebuildID uuid.UUID, db database.Store) error { - var ( - provisionerJob *database.ProvisionerJob - ) - +func (c Controller) createPrebuild(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template) error { name := fmt.Sprintf("prebuild-%s", prebuildID) now := dbtime.Now() @@ -296,14 +303,30 @@ func (c Controller) provision(ctx context.Context, template database.Template, p return xerrors.Errorf("get workspace by ID: %w", err) } - builder := wsbuilder.New(workspace, database.WorkspaceTransitionStart). + c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name), slog.F("workspace_id", prebuildID.String())) + + return c.provision(ctx, db, prebuildID, template, database.WorkspaceTransitionStart, workspace) +} +func (c Controller) deletePrebuild(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template) error { + workspace, err := db.GetWorkspaceByID(ctx, prebuildID) + if err != nil { + return xerrors.Errorf("get workspace by ID: %w", err) + } + + c.logger.Info(ctx, "attempting to delete prebuild", slog.F("workspace_id", prebuildID.String())) + + return c.provision(ctx, db, prebuildID, template, database.WorkspaceTransitionDelete, workspace) +} + +func (c Controller) provision(ctx context.Context, db database.Store, prebuildID uuid.UUID, template database.Template, transition database.WorkspaceTransition, workspace database.Workspace) error { + builder := wsbuilder.New(workspace, transition). Reason(database.BuildReasonInitiator). Initiator(PrebuildOwnerUUID). ActiveVersion(). VersionID(template.ActiveVersionID) // RichParameterValues(req.RichParameterValues) // TODO: fetch preset's params - _, provisionerJob, _, err = builder.Build( + _, provisionerJob, _, err := builder.Build( ctx, db, func(action policy.Action, object rbac.Objecter) bool { @@ -321,8 +344,8 @@ func (c Controller) provision(ctx context.Context, template database.Template, p c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err)) } - c.logger.Info(ctx, "provisioned new prebuild", slog.F("prebuild_id", prebuildID.String()), - slog.F("job_id", provisionerJob.ID)) + c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition), + slog.F("prebuild_id", prebuildID.String()), slog.F("job_id", provisionerJob.ID)) return nil }