package autobuild
import (
"context"
"database/sql"
"fmt"
"net/http"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/dustin/go-humanize"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/files"
"github.com/coder/coder/v2/coderd/audit"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
)
// Executor automatically starts or stops workspaces.
type Executor struct {
ctx context.Context
db database.Store
ps pubsub.Pubsub
fileCache *files.Cache
templateScheduleStore *atomic.Pointer[schedule.TemplateScheduleStore]
accessControlStore *atomic.Pointer[dbauthz.AccessControlStore]
auditor *atomic.Pointer[audit.Auditor]
log slog.Logger
tick <-chan time.Time
statsCh chan<- Stats
// NotificationsEnqueuer handles enqueueing notifications for delivery by SMTP, webhook, etc.
notificationsEnqueuer notifications.Enqueuer
reg prometheus.Registerer
experiments codersdk.Experiments
metrics executorMetrics
}
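// executorMetrics holds the Prometheus metrics tracked by the lifecycle executor.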
type executorMetrics struct {
autobuildExecutionDuration prometheus.Histogram
}
// Stats contains information about one run of Executor.
type Stats struct {
Transitions map[uuid.UUID]database.WorkspaceTransition
Elapsed time.Duration
Errors map[uuid.UUID]error
}
// NewExecutor returns a new autobuild executor.
func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments) *Executor {
factory := promauto.With(reg)
le := &Executor{
//nolint:gocritic // Autostart has a limited set of permissions.
ctx: dbauthz.AsAutostart(ctx),
db: db,
ps: ps,
fileCache: fc,
templateScheduleStore: tss,
tick: tick,
log: log.Named("autobuild"),
auditor: auditor,
accessControlStore: acs,
notificationsEnqueuer: enqueuer,
reg: reg,
experiments: exp,
metrics: executorMetrics{
autobuildExecutionDuration: factory.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "lifecycle",
Name: "autobuild_execution_duration_seconds",
Help: "Duration of each autobuild execution.",
Buckets: prometheus.DefBuckets,
}),
},
}
return le
}
// WithStatsChannel will cause Executor to push a Stats value to ch after
// every tick.
func (e *Executor) WithStatsChannel(ch chan<- Stats) *Executor {
e.statsCh = ch
return e
}
// Run will cause executor to start or stop workspaces on every
// tick from its channel. It will stop when its context is Done, or when
// its channel is closed.
func (e *Executor) Run() {
go func() {
for {
select {
case <-e.ctx.Done():
return
case t, ok := <-e.tick:
if !ok {
return
}
stats := e.runOnce(t)
e.metrics.autobuildExecutionDuration.Observe(stats.Elapsed.Seconds())
if e.statsCh != nil {
select {
case <-e.ctx.Done():
return
case e.statsCh <- stats:
}
}
e.log.Debug(e.ctx, "run stats", slog.F("elapsed", stats.Elapsed), slog.F("transitions", stats.Transitions))
}
}
}()
}
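// runOnce performs a single lifecycle pass: it loads every workspace eligible
// for a transition at tick t and attempts to start, stop, delete, or mark each
// one dormant as appropriate.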
func (e *Executor) runOnce(t time.Time) Stats {
stats := Stats{
Transitions: make(map[uuid.UUID]database.WorkspaceTransition),
Errors: make(map[uuid.UUID]error),
}
// We build the transitions map concurrently, so we need a mutex to serialize writes to it.
statsMu := sync.Mutex{}
defer func() {
stats.Elapsed = time.Since(t)
}()
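// Schedules and deadlines are evaluated at minute granularity, so truncate the tick to the minute.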
currentTick := t.Truncate(time.Minute)
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, currentTick)
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
}
// Sort the workspaces by build template version ID so that we can group
// identical template versions together. This is a slight (and imperfect)
// optimization.
//
// `wsbuilder` needs to load the terraform files for a given template version
// into memory. If 2 workspaces are using the same template version, they will
// share the same files in the FileCache. This only happens if the builds happen
// in parallel.
// TODO: Actually make sure the cache holds the files for the full
// set of identical template versions. Then unload the files when the builds
// are done. Right now, this relies on luck for the 10 goroutine workers to
// overlap and keep the file reference in the cache alive.
slices.SortFunc(workspaces, func(a, b database.GetWorkspacesEligibleForTransitionRow) int {
return strings.Compare(a.BuildTemplateVersionID.UUID.String(), b.BuildTemplateVersionID.UUID.String())
})
// We only use errgroup here for its API convenience, not for early
// cancellation. This means we only return nil errors in the eg.Go closures.
eg := errgroup.Group{}
// Limit the concurrency to avoid overloading the database.
eg.SetLimit(10)
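// Process each eligible workspace in its own goroutine, bounded by the limit above.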
for _, ws := range workspaces {
wsID := ws.ID
wsName := ws.Name
log := e.log.With(
slog.F("workspace_id", wsID),
slog.F("workspace_name", wsName),
)
eg.Go(func() error {
err := func() error {
var (
job *database.ProvisionerJob
auditLog *auditParams
shouldNotifyDormancy bool
nextBuild *database.WorkspaceBuild
activeTemplateVersion database.TemplateVersion
ws database.Workspace
tmpl database.Template
didAutoUpdate bool
)
err := e.db.InTx(func(tx database.Store) error {
var err error
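// Acquire a lock scoped to this workspace so that concurrent lifecycle
// executors (e.g. multiple coderd replicas) don't process the same
// workspace at the same time. If the lock is held elsewhere, skip it.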
ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
if err != nil {
return xerrors.Errorf("try acquire lifecycle executor lock: %w", err)
}
if !ok {
log.Debug(e.ctx, "unable to acquire lock for workspace, skipping")
return nil
}
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err = tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
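// The owner is loaded so eligibility checks can account for suspended users.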
user, err := tx.GetUserByID(e.ctx, ws.OwnerID)
if err != nil {
return xerrors.Errorf("get user by id: %w", err)
}
// Determine the workspace state based on its latest build.
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
return xerrors.Errorf("get latest workspace build: %w", err)
}
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
return xerrors.Errorf("get latest provisioner job: %w", err)
}
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
if err != nil {
return xerrors.Errorf("get template scheduling options: %w", err)
}
// If NextStartAt is not valid, we need to re-compute it.
if !ws.NextStartAt.Valid && ws.AutostartSchedule.Valid {
next, err := schedule.NextAllowedAutostart(currentTick, ws.AutostartSchedule.String, templateSchedule)
if err == nil {
nextStartAt := sql.NullTime{Valid: true, Time: dbtime.Time(next.UTC())}
if err = tx.UpdateWorkspaceNextStartAt(e.ctx, database.UpdateWorkspaceNextStartAtParams{
ID: wsID,
NextStartAt: nextStartAt,
}); err != nil {
return xerrors.Errorf("update workspace next start at: %w", err)
}
// This saves re-fetching the workspace.
ws.NextStartAt = nextStartAt
}
}
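// Load the template and its active version so we can decide whether an
// autostart should also update the workspace to the active version.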
tmpl, err = tx.GetTemplateByID(e.ctx, ws.TemplateID)
if err != nil {
return xerrors.Errorf("get template by ID: %w", err)
}
activeTemplateVersion, err = tx.GetTemplateVersionByID(e.ctx, tmpl.ActiveVersionID)
if err != nil {
return xerrors.Errorf("get active template version by ID: %w", err)
}
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(tmpl)
nextTransition, reason, err := getNextTransition(user, ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
// err is used to indicate that a workspace is not eligible, so
// returning nil here is fine. Ultimately the distinction doesn't
// matter since the transaction is read-only up to this point.
return nil
}
if nextTransition != "" {
builder := wsbuilder.New(ws, nextTransition).
SetLastWorkspaceBuildInTx(&latestBuild).
SetLastWorkspaceBuildJobInTx(&latestJob).
Experiments(e.experiments).
Reason(reason)
log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition))
if nextTransition == database.WorkspaceTransitionStart &&
useActiveVersion(accessControl, ws) {
log.Debug(e.ctx, "autostarting with active version")
builder = builder.ActiveVersion()
if latestBuild.TemplateVersionID != tmpl.ActiveVersionID {
// control flag to know if the workspace was auto-updated,
// so the lifecycle executor can notify the user
didAutoUpdate = true
}
}
nextBuild, job, _, err = builder.Build(e.ctx, tx, e.fileCache, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
if err != nil {
return xerrors.Errorf("build workspace with transition %q: %w", nextTransition, err)
}
}
// Transition the workspace to dormant if it has breached the template's
// threshold for inactivity.
if reason == database.BuildReasonDormancy {
wsOld := ws
wsNew, err := tx.UpdateWorkspaceDormantDeletingAt(e.ctx, database.UpdateWorkspaceDormantDeletingAtParams{
ID: ws.ID,
DormantAt: sql.NullTime{
Time: dbtime.Now(),
Valid: true,
},
})
if err != nil {
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
}
auditLog = &auditParams{
Old: wsOld.WorkspaceTable(),
New: wsNew,
}
// Keep `ws` accurate without doing another SQL fetch.
ws.DormantAt = wsNew.DormantAt
shouldNotifyDormancy = true
log.Info(e.ctx, "dormant workspace",
slog.F("last_used_at", ws.LastUsedAt),
slog.F("time_til_dormant", templateSchedule.TimeTilDormant),
slog.F("since_last_used_at", time.Since(ws.LastUsedAt)),
)
}
if reason == database.BuildReasonAutodelete {
log.Info(e.ctx, "deleted workspace",
slog.F("dormant_at", ws.DormantAt.Time),
slog.F("time_til_dormant_autodelete", templateSchedule.TimeTilDormantAutoDelete),
)
}
if nextTransition == "" {
return nil
}
statsMu.Lock()
stats.Transitions[ws.ID] = nextTransition
statsMu.Unlock()
log.Info(e.ctx, "scheduling workspace transition",
slog.F("transition", nextTransition),
slog.F("reason", reason),
)
return nil
// Run with RepeatableRead isolation so that the build process sees the same data
// as our calculation that determines whether an autobuild is necessary.
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
TxIdentifier: "lifecycle",
})
if auditLog != nil {
// If the transition didn't succeed, then updating the workspace
// to indicate dormancy didn't either.
auditLog.Success = err == nil
auditBuild(e.ctx, log, *e.auditor.Load(), *auditLog)
}
if didAutoUpdate && err == nil {
nextBuildReason := ""
if nextBuild != nil {
nextBuildReason = string(nextBuild.Reason)
}
templateVersionMessage := activeTemplateVersion.Message
if templateVersionMessage == "" {
templateVersionMessage = "None provided"
}
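// Let the owner know their workspace was auto-updated to the active template version.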
if _, err := e.notificationsEnqueuer.Enqueue(e.ctx, ws.OwnerID, notifications.TemplateWorkspaceAutoUpdated,
map[string]string{
"name": ws.Name,
"initiator": "autobuild",
"reason": nextBuildReason,
"template_version_name": activeTemplateVersion.Name,
"template_version_message": templateVersionMessage,
}, "autobuild",
// Associate this notification with all the related entities.
ws.ID, ws.OwnerID, ws.TemplateID, ws.OrganizationID,
); err != nil {
log.Warn(e.ctx, "failed to notify of autoupdated workspace", slog.Error(err))
}
}
if err != nil {
return xerrors.Errorf("transition workspace: %w", err)
}
if job != nil {
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
// with an outer transaction like this, and we need to make sure the outer transaction commits before
// posting the job. If we post before the transaction commits, provisionerd might try to acquire the
// job, fail, and then sit idle instead of picking up the job.
err = provisionerjobs.PostJob(e.ps, *job)
if err != nil {
return xerrors.Errorf("post provisioner job to pubsub: %w", err)
}
}
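// Notify the owner that their workspace has been marked dormant due to inactivity.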
if shouldNotifyDormancy {
dormantTime := dbtime.Now().Add(time.Duration(tmpl.TimeTilDormant))
_, err = e.notificationsEnqueuer.Enqueue(
e.ctx,
ws.OwnerID,
notifications.TemplateWorkspaceDormant,
map[string]string{
"name": ws.Name,
"reason": "inactivity exceeded the dormancy threshold",
"timeTilDormant": humanize.Time(dormantTime),
},
"lifecycle_executor",
ws.ID,
ws.OwnerID,
ws.TemplateID,
ws.OrganizationID,
)
if err != nil {
log.Warn(e.ctx, "failed to notify of workspace marked as dormant", slog.Error(err), slog.F("workspace_id", ws.ID))
}
}
return nil
}()
if err != nil && !xerrors.Is(err, context.Canceled) {
log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
statsMu.Lock()
stats.Errors[wsID] = err
statsMu.Unlock()
}
// Even though we got an error we still return nil to avoid
// short-circuiting the evaluation loop.
return nil
})
}
// This should not happen since the goroutines above always return nil; we don't use errgroup for early cancellation.
err = eg.Wait()
if err != nil {
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
}
return stats
}
// getNextTransition returns the next eligible transition for the workspace
// as well as the reason for why it is transitioning. It is possible
// for this function to return a nil error as well as an empty transition.
// In such cases it means no provisioning should occur but the workspace
// may be "transitioning" to a new state (such as an inactive, stopped
// workspace transitioning to the dormant state).
func getNextTransition(
user database.User,
ws database.Workspace,
latestBuild database.WorkspaceBuild,
latestJob database.ProvisionerJob,
templateSchedule schedule.TemplateScheduleOptions,
currentTick time.Time,
) (
database.WorkspaceTransition,
database.BuildReason,
error,
) {
switch {
case isEligibleForAutostop(user, ws, latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForAutostart(user, ws, latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForDormantStop(ws, templateSchedule, currentTick):
// Only stop started workspaces.
if latestBuild.Transition == database.WorkspaceTransitionStart {
return database.WorkspaceTransitionStop, database.BuildReasonDormancy, nil
}
// We shouldn't transition the workspace but we should still
// make it dormant.
return "", database.BuildReasonDormancy, nil
case isEligibleForDelete(ws, templateSchedule, latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionDelete, database.BuildReasonAutodelete, nil
default:
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
}
}
// isEligibleForAutostart returns true if the workspace should be autostarted.
func isEligibleForAutostart(user database.User, ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// Don't attempt to autostart workspaces for suspended users.
if user.Status != database.UserStatusActive {
return false
}
// Don't attempt to autostart failed workspaces.
if job.JobStatus == database.ProvisionerJobStatusFailed {
return false
}
// If the workspace is dormant we should not autostart it.
if ws.DormantAt.Valid {
return false
}
// If the last transition for the workspace was not 'stop' then the workspace
// cannot be started.
if build.Transition != database.WorkspaceTransitionStop {
return false
}
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
// autostart the workspace.
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
return false
}
nextTransition, err := schedule.NextAllowedAutostart(build.CreatedAt, ws.AutostartSchedule.String, templateSchedule)
if err != nil {
return false
}
// Must use '.Before' vs '.After' so equal times are considered "valid for autostart".
return !currentTick.Before(nextTransition)
}
// isEligibleForAutostop returns true if the workspace should be autostopped.
func isEligibleForAutostop(user database.User, ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
if job.JobStatus == database.ProvisionerJobStatusFailed {
return false
}
// If the workspace is dormant we should not autostop it.
if ws.DormantAt.Valid {
return false
}
if build.Transition == database.WorkspaceTransitionStart && user.Status == database.UserStatusSuspended {
return true
}
// A workspace must be started in order for it to be auto-stopped.
return build.Transition == database.WorkspaceTransitionStart &&
!build.Deadline.IsZero() &&
// We do not want to stop a workspace prior to it breaching its deadline.
!currentTick.Before(build.Deadline)
}
// isEligibleForDormantStop returns true if the workspace should be dormant
// for breaching the inactivity threshold of the template.
func isEligibleForDormantStop(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// Only attempt against workspaces not already dormant.
return !ws.DormantAt.Valid &&
// The template must specify a time_til_dormant value.
templateSchedule.TimeTilDormant > 0 &&
// The workspace must breach the time_til_dormant value.
currentTick.Sub(ws.LastUsedAt) > templateSchedule.TimeTilDormant
}
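// isEligibleForDelete returns true if the dormant workspace has breached the
// template's time_til_dormant_autodelete threshold and should be deleted.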
func isEligibleForDelete(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, lastBuild database.WorkspaceBuild, lastJob database.ProvisionerJob, currentTick time.Time) bool {
eligible := ws.DormantAt.Valid && ws.DeletingAt.Valid &&
// Dormant workspaces should only be deleted if a time_til_dormant_autodelete value is specified.
templateSchedule.TimeTilDormantAutoDelete > 0 &&
// The workspace must breach the time_til_dormant_autodelete value.
currentTick.After(ws.DeletingAt.Time)
// If the last delete job failed, we should wait 24 hours before trying again.
// Builds are resource-intensive so retrying every minute is not productive
// and will hold compute hostage.
if lastBuild.Transition == database.WorkspaceTransitionDelete && lastJob.JobStatus == database.ProvisionerJobStatusFailed {
return eligible && lastJob.Finished() && currentTick.Sub(lastJob.FinishedAt()) > time.Hour*24
}
return eligible
}
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
// due to a failed build.
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// If the template has specified a failure TTL.
return templateSchedule.FailureTTL > 0 &&
// And the job resulted in failure.
job.JobStatus == database.ProvisionerJobStatusFailed &&
build.Transition == database.WorkspaceTransitionStart &&
// And sufficient time has elapsed since the job has completed.
job.CompletedAt.Valid &&
currentTick.Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL
}
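// auditParams holds the old and new workspace rows recorded in a background audit entry.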
type auditParams struct {
Old database.WorkspaceTable
New database.WorkspaceTable
Success bool
}
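// auditBuild writes a background audit log entry for an automatic workspace transition.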
func auditBuild(ctx context.Context, log slog.Logger, auditor audit.Auditor, params auditParams) {
status := http.StatusInternalServerError
if params.Success {
status = http.StatusOK
}
audit.BackgroundAudit(ctx, &audit.BackgroundAuditParams[database.WorkspaceTable]{
Audit: auditor,
Log: log,
UserID: params.New.OwnerID,
OrganizationID: params.New.OrganizationID,
// Right now there's no request associated with an autobuild
// operation.
RequestID: uuid.Nil,
Action: database.AuditActionWrite,
Old: params.Old,
New: params.New,
Status: status,
})
}
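// useActiveVersion returns true if the workspace must be started with the
// template's active version, either because the template requires the active
// version or because the workspace has automatic updates set to always.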
func useActiveVersion(opts dbauthz.TemplateAccessControl, ws database.Workspace) bool {
return opts.RequireActiveVersion || ws.AutomaticUpdates == database.AutomaticUpdatesAlways
}