package autobuild
import (
"context"
"database/sql"
"fmt"
"net/http"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/dustin/go-humanize"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/files"
"github.com/coder/coder/v2/coderd/audit"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
)
// Executor automatically starts or stops workspaces.
type Executor struct {
ctx context.Context
db database.Store
ps pubsub.Pubsub
fileCache *files.Cache
templateScheduleStore *atomic.Pointer[schedule.TemplateScheduleStore]
accessControlStore *atomic.Pointer[dbauthz.AccessControlStore]
auditor *atomic.Pointer[audit.Auditor]
log slog.Logger
tick <-chan time.Time
statsCh chan<- Stats
// NotificationsEnqueuer handles enqueueing notifications for delivery by SMTP, webhook, etc.
notificationsEnqueuer notifications.Enqueuer
reg prometheus.Registerer
experiments codersdk.Experiments
metrics executorMetrics
}
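// executorMetrics holds the Prometheus metrics tracked by the lifecycle executor.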
type executorMetrics struct {
autobuildExecutionDuration prometheus.Histogram
}
// Stats contains information about one run of Executor.
type Stats struct {
Transitions map[uuid.UUID]database.WorkspaceTransition
Elapsed time.Duration
Errors map[uuid.UUID]error
}
// NewExecutor returns a new autobuild executor.
func NewExecutor(ctx context.Context, db database.Store, ps pubsub.Pubsub, fc *files.Cache, reg prometheus.Registerer, tss *atomic.Pointer[schedule.TemplateScheduleStore], auditor *atomic.Pointer[audit.Auditor], acs *atomic.Pointer[dbauthz.AccessControlStore], log slog.Logger, tick <-chan time.Time, enqueuer notifications.Enqueuer, exp codersdk.Experiments) *Executor {
factory := promauto.With(reg)
le := &Executor{
//nolint:gocritic // Autostart has a limited set of permissions.
ctx: dbauthz.AsAutostart(ctx),
db: db,
ps: ps,
fileCache: fc,
templateScheduleStore: tss,
tick: tick,
log: log.Named("autobuild"),
auditor: auditor,
accessControlStore: acs,
notificationsEnqueuer: enqueuer,
reg: reg,
experiments: exp,
metrics: executorMetrics{
autobuildExecutionDuration: factory.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "lifecycle",
Name: "autobuild_execution_duration_seconds",
Help: "Duration of each autobuild execution.",
Buckets: prometheus.DefBuckets,
}),
},
}
return le
}
// WithStatsChannel will cause Executor to push a Stats value to ch after
// every tick.
func (e *Executor) WithStatsChannel(ch chan<- Stats) *Executor {
e.statsCh = ch
return e
}
// Run will cause executor to start or stop workspaces on every
// tick from its channel. It will stop when its context is Done, or when
// its channel is closed.
func (e *Executor) Run() {
go func() {
for {
select {
case <-e.ctx.Done():
return
case t, ok := <-e.tick:
if !ok {
return
}
stats := e.runOnce(t)
e.metrics.autobuildExecutionDuration.Observe(stats.Elapsed.Seconds())
if e.statsCh != nil {
select {
case <-e.ctx.Done():
return
case e.statsCh <- stats:
}
}
e.log.Debug(e.ctx, "run stats", slog.F("elapsed", stats.Elapsed), slog.F("transitions", stats.Transitions))
}
}
}()
}
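// runOnce performs a single lifecycle pass: it loads every workspace eligible
// for a transition at tick t and attempts to start, stop, delete, or mark each
// one dormant as appropriate.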
func (e *Executor) runOnce(t time.Time) Stats {
stats := Stats{
Transitions: make(map[uuid.UUID]database.WorkspaceTransition),
Errors: make(map[uuid.UUID]error),
}
// We build the transitions map concurrently, so we need a mutex to serialize writes to it.
statsMu := sync.Mutex{}
defer func() {
stats.Elapsed = time.Since(t)
}()
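// Schedules and deadlines are evaluated at minute granularity, so truncate the tick to the minute.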
currentTick := t.Truncate(time.Minute)
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, currentTick)
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
}
// Sort the workspaces by build template version ID so that we can group
// identical template versions together. This is a slight (and imperfect)
// optimization.
//
// `wsbuilder` needs to load the terraform files for a given template version
// into memory. If 2 workspaces are using the same template version, they will
// share the same files in the FileCache. This only happens if the builds happen
// in parallel.
// TODO: Actually make sure the cache holds the files for the full
// set of identical template versions. Then unload the files when the builds
// are done. Right now, this relies on luck for the 10 goroutine workers to
// overlap and keep the file reference in the cache alive.
slices.SortFunc(workspaces, func(a, b database.GetWorkspacesEligibleForTransitionRow) int {
return strings.Compare(a.BuildTemplateVersionID.UUID.String(), b.BuildTemplateVersionID.UUID.String())
})
// We only use errgroup here for its API convenience, not for early
// cancellation. This means we only return nil errors in the eg.Go closures.
eg := errgroup.Group{}
// Limit the concurrency to avoid overloading the database.
eg.SetLimit(10)
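// Process each eligible workspace in its own goroutine, bounded by the limit above.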
for _, ws := range workspaces {
wsID := ws.ID
wsName := ws.Name
log := e.log.With(
slog.F("workspace_id", wsID),
slog.F("workspace_name", wsName),
)
eg.Go(func() error {
err := func() error {
var (
job *database.ProvisionerJob
auditLog *auditParams
shouldNotifyDormancy bool
nextBuild *database.WorkspaceBuild
activeTemplateVersion database.TemplateVersion
ws database.Workspace
tmpl database.Template
didAutoUpdate bool
)
err := e.db.InTx(func(tx database.Store) error {
var err error
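// Acquire a lock scoped to this workspace so that concurrent lifecycle
// executors (e.g. multiple coderd replicas) don't process the same
// workspace at the same time. If the lock is held elsewhere, skip it.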
ok, err := tx.TryAcquireLock(e.ctx, database.GenLockID(fmt.Sprintf("lifecycle-executor:%s", wsID)))
if err != nil {
return xerrors.Errorf("try acquire lifecycle executor lock: %w", err)
}
if !ok {
log.Debug(e.ctx, "unable to acquire lock for workspace, skipping")
return nil
}
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err = tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
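// The owner is loaded so eligibility checks can account for suspended users.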
user, err := tx.GetUserByID(e.ctx, ws.OwnerID)
if err != nil {
return xerrors.Errorf("get user by id: %w", err)
}
// Determine the workspace state based on its latest build.
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
return xerrors.Errorf("get latest workspace build: %w", err)
}
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
return xerrors.Errorf("get latest provisioner job: %w", err)
}
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
if err != nil {
return xerrors.Errorf("get template scheduling options: %w", err)
}
// If NextStartAt is not valid, we need to re-compute it.
if !ws.NextStartAt.Valid && ws.AutostartSchedule.Valid {
next, err := schedule.NextAllowedAutostart(currentTick, ws.AutostartSchedule.String, templateSchedule)
if err == nil {
nextStartAt := sql.NullTime{Valid: true, Time: dbtime.Time(next.UTC())}
if err = tx.UpdateWorkspaceNextStartAt(e.ctx, database.UpdateWorkspaceNextStartAtParams{
ID: wsID,
NextStartAt: nextStartAt,
}); err != nil {
return xerrors.Errorf("update workspace next start at: %w", err)
}
// This saves re-fetching the workspace.
ws.NextStartAt = nextStartAt
}
}
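// Load the template and its active version so we can decide whether an
// autostart should also update the workspace to the active version.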
tmpl, err = tx.GetTemplateByID(e.ctx, ws.TemplateID)
if err != nil {
return xerrors.Errorf("get template by ID: %w", err)
}
activeTemplateVersion, err = tx.GetTemplateVersionByID(e.ctx, tmpl.ActiveVersionID)
if err != nil {
return xerrors.Errorf("get active template version by ID: %w", err)
}
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(tmpl)
nextTransition, reason, err := getNextTransition(user, ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
// err is used to indicate that a workspace is not eligible, so
// returning nil here is fine. Ultimately the distinction doesn't
// matter since the transaction is read-only up to this point.
return nil
}
if nextTransition != "" {
builder := wsbuilder.New(ws, nextTransition).
SetLastWorkspaceBuildInTx(&latestBuild).
SetLastWorkspaceBuildJobInTx(&latestJob).
Experiments(e.experiments).
Reason(reason)
log.Debug(e.ctx, "auto building workspace", slog.F("transition", nextTransition))
if nextTransition == database.WorkspaceTransitionStart &&
useActiveVersion(accessControl, ws) {
log.Debug(e.ctx, "autostarting with active version")
builder = builder.ActiveVersion()
if latestBuild.TemplateVersionID != tmpl.ActiveVersionID {
// control flag to know if the workspace was auto-updated,
// so the lifecycle executor can notify the user
didAutoUpdate = true
}
}
nextBuild, job, _, err = builder.Build(e.ctx, tx, e.fileCache, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
if err != nil {
return xerrors.Errorf("build workspace with transition %q: %w", nextTransition, err)
}
}
// Transition the workspace to dormant if it has breached the template's
// threshold for inactivity.
if reason == database.BuildReasonDormancy {
wsOld := ws
wsNew, err := tx.UpdateWorkspaceDormantDeletingAt(e.ctx, database.UpdateWorkspaceDormantDeletingAtParams{
ID: ws.ID,
DormantAt: sql.NullTime{
Time: dbtime.Now(),
Valid: true,
},
})
if err != nil {
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
}
auditLog = &auditParams{
Old: wsOld.WorkspaceTable(),
New: wsNew,
}
// Keep `ws` accurate without doing another SQL fetch.
ws.DormantAt = wsNew.DormantAt
shouldNotifyDormancy = true
log.Info(e.ctx, "dormant workspace",
slog.F("last_used_at", ws.LastUsedAt),
slog.F("time_til_dormant", templateSchedule.TimeTilDormant),
slog.F("since_last_used_at", time.Since(ws.LastUsedAt)),
)
}
if reason == database.BuildReasonAutodelete {
log.Info(e.ctx, "deleted workspace",
slog.F("dormant_at", ws.DormantAt.Time),
slog.F("time_til_dormant_autodelete", templateSchedule.TimeTilDormantAutoDelete),
)
}
if nextTransition == "" {
return nil
}
statsMu.Lock()
stats.Transitions[ws.ID] = nextTransition
statsMu.Unlock()
log.Info(e.ctx, "scheduling workspace transition",
slog.F("transition", nextTransition),
slog.F("reason", reason),
)
return nil
// Run with RepeatableRead isolation so that the build process sees the same data
// as our calculation that determines whether an autobuild is necessary.
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
TxIdentifier: "lifecycle",
})
if auditLog != nil {
// If the transition didn't succeed, then updating the workspace
// to indicate dormancy didn't either.
auditLog.Success = err == nil
auditBuild(e.ctx, log, *e.auditor.Load(), *auditLog)
}
if didAutoUpdate && err == nil {
nextBuildReason := ""
if nextBuild != nil {
nextBuildReason = string(nextBuild.Reason)
}
templateVersionMessage := activeTemplateVersion.Message
if templateVersionMessage == "" {
templateVersionMessage = "None provided"
}
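// Let the owner know their workspace was auto-updated to the active template version.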
if _, err := e.notificationsEnqueuer.Enqueue(e.ctx, ws.OwnerID, notifications.TemplateWorkspaceAutoUpdated,
map[string]string{
"name": ws.Name,
"initiator": "autobuild",
"reason": nextBuildReason,
"template_version_name": activeTemplateVersion.Name,
"template_version_message": templateVersionMessage,
}, "autobuild",
// Associate this notification with all the related entities.
ws.ID, ws.OwnerID, ws.TemplateID, ws.OrganizationID,
); err != nil {
log.Warn(e.ctx, "failed to notify of autoupdated workspace", slog.Error(err))
}
}
if err != nil {
return xerrors.Errorf("transition workspace: %w", err)
}
if job != nil {
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
// with an outer transaction like this, and we need to make sure the outer transaction commits before
// posting the job. If we post before the transaction commits, provisionerd might try to acquire the
// job, fail, and then sit idle instead of picking up the job.
err = provisionerjobs.PostJob(e.ps, *job)
if err != nil {
return xerrors.Errorf("post provisioner job to pubsub: %w", err)
}
}
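// Notify the owner that their workspace has been marked dormant due to inactivity.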
if shouldNotifyDormancy {
dormantTime := dbtime.Now().Add(time.Duration(tmpl.TimeTilDormant))
_, err = e.notificationsEnqueuer.Enqueue(
e.ctx,
ws.OwnerID,
notifications.TemplateWorkspaceDormant,
map[string]string{
"name": ws.Name,
"reason": "inactivity exceeded the dormancy threshold",
"timeTilDormant": humanize.Time(dormantTime),
},
"lifecycle_executor",
ws.ID,
ws.OwnerID,
ws.TemplateID,
ws.OrganizationID,
)
if err != nil {
log.Warn(e.ctx, "failed to notify of workspace marked as dormant", slog.Error(err), slog.F("workspace_id", ws.ID))
}
}
return nil
}()
if err != nil && !xerrors.Is(err, context.Canceled) {
log.Error(e.ctx, "failed to transition workspace", slog.Error(err))
statsMu.Lock()
stats.Errors[wsID] = err
statsMu.Unlock()
}
// Even though we got an error we still return nil to avoid
// short-circuiting the evaluation loop.
return nil
})
}
// This should not happen since the goroutines above always return nil; we don't use errgroup for early cancellation.
err = eg.Wait()
if err != nil {
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
}
return stats
}
// getNextTransition returns the next eligible transition for the workspace
// as well as the reason for why it is transitioning. It is possible
// for this function to return a nil error as well as an empty transition.
// In such cases it means no provisioning should occur but the workspace
// may be "transitioning" to a new state (such as an inactive, stopped
// workspace transitioning to the dormant state).
func getNextTransition(
user database.User,
ws database.Workspace,
latestBuild database.WorkspaceBuild,
latestJob database.ProvisionerJob,
templateSchedule schedule.TemplateScheduleOptions,
currentTick time.Time,
) (
database.WorkspaceTransition,
database.BuildReason,
error,
) {
switch {
case isEligibleForAutostop(user, ws, latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForAutostart(user, ws, latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForDormantStop(ws, templateSchedule, currentTick):
// Only stop started workspaces.
if latestBuild.Transition == database.WorkspaceTransitionStart {
return database.WorkspaceTransitionStop, database.BuildReasonDormancy, nil
}
// We shouldn't transition the workspace but we should still
// make it dormant.
return "", database.BuildReasonDormancy, nil
case isEligibleForDelete(ws, templateSchedule, latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionDelete, database.BuildReasonAutodelete, nil
default:
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
}
}
// isEligibleForAutostart returns true if the workspace should be autostarted.
func isEligibleForAutostart(user database.User, ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// Don't attempt to autostart workspaces for suspended users.
if user.Status != database.UserStatusActive {
return false
}
// Don't attempt to autostart failed workspaces.
if job.JobStatus == database.ProvisionerJobStatusFailed {
return false
}
// If the workspace is dormant we should not autostart it.
if ws.DormantAt.Valid {
return false
}
// If the last transition for the workspace was not 'stop' then the workspace
// cannot be started.
if build.Transition != database.WorkspaceTransitionStop {
return false
}
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
// autostart the workspace.
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
return false
}
nextTransition, err := schedule.NextAllowedAutostart(build.CreatedAt, ws.AutostartSchedule.String, templateSchedule)
if err != nil {
return false
}
// Must use '.Before' vs '.After' so equal times are considered "valid for autostart".
return !currentTick.Before(nextTransition)
}
// isEligibleForAutostop returns true if the workspace should be autostopped.
func isEligibleForAutostop(user database.User, ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
if job.JobStatus == database.ProvisionerJobStatusFailed {
return false
}
// If the workspace is dormant we should not autostop it.
if ws.DormantAt.Valid {
return false
}
if build.Transition == database.WorkspaceTransitionStart && user.Status == database.UserStatusSuspended {
return true
}
// A workspace must be started in order for it to be auto-stopped.
return build.Transition == database.WorkspaceTransitionStart &&
!build.Deadline.IsZero() &&
// We do not want to stop a workspace prior to it breaching its deadline.
!currentTick.Before(build.Deadline)
}
// isEligibleForDormantStop returns true if the workspace should be dormant
// for breaching the inactivity threshold of the template.
func isEligibleForDormantStop(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// Only attempt against workspaces not already dormant.
return !ws.DormantAt.Valid &&
// The template must specify a time_til_dormant value.
templateSchedule.TimeTilDormant > 0 &&
// The workspace must breach the time_til_dormant value.
currentTick.Sub(ws.LastUsedAt) > templateSchedule.TimeTilDormant
}
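// isEligibleForDelete returns true if the dormant workspace has breached the
// template's time_til_dormant_autodelete threshold and should be deleted.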
func isEligibleForDelete(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, lastBuild database.WorkspaceBuild, lastJob database.ProvisionerJob, currentTick time.Time) bool {
eligible := ws.DormantAt.Valid && ws.DeletingAt.Valid &&
// Dormant workspaces should only be deleted if a time_til_dormant_autodelete value is specified.
templateSchedule.TimeTilDormantAutoDelete > 0 &&
// The workspace must breach the time_til_dormant_autodelete value.
currentTick.After(ws.DeletingAt.Time)
// If the last delete job failed, we should wait 24 hours before trying again.
// Builds are resource-intensive so retrying every minute is not productive
// and will hold compute hostage.
if lastBuild.Transition == database.WorkspaceTransitionDelete && lastJob.JobStatus == database.ProvisionerJobStatusFailed {
return eligible && lastJob.Finished() && currentTick.Sub(lastJob.FinishedAt()) > time.Hour*24
}
return eligible
}
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
// due to a failed build.
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// If the template has specified a failure TTL.
return templateSchedule.FailureTTL > 0 &&
// And the job resulted in failure.
job.JobStatus == database.ProvisionerJobStatusFailed &&
build.Transition == database.WorkspaceTransitionStart &&
// And sufficient time has elapsed since the job has completed.
job.CompletedAt.Valid &&
currentTick.Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL
}
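// auditParams holds the old and new workspace rows recorded in a background audit entry.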
type auditParams struct {
Old database.WorkspaceTable
New database.WorkspaceTable
Success bool
}
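// auditBuild writes a background audit log entry for an automatic workspace transition.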
func auditBuild(ctx context.Context, log slog.Logger, auditor audit.Auditor, params auditParams) {
status := http.StatusInternalServerError
if params.Success {
status = http.StatusOK
}
audit.BackgroundAudit(ctx, &audit.BackgroundAuditParams[database.WorkspaceTable]{
Audit: auditor,
Log: log,
UserID: params.New.OwnerID,
OrganizationID: params.New.OrganizationID,
// Right now there's no request associated with an autobuild
// operation.
RequestID: uuid.Nil,
Action: database.AuditActionWrite,
Old: params.Old,
New: params.New,
Status: status,
})
}
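// useActiveVersion returns true if the workspace must be started with the
// template's active version, either because the template requires the active
// version or because the workspace has automatic updates set to always.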
func useActiveVersion(opts dbauthz.TemplateAccessControl, ws database.Workspace) bool {
return opts.RequireActiveVersion || ws.AutomaticUpdates == database.AutomaticUpdatesAlways
}