package jobreaper

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"time"

	"golang.org/x/xerrors"

	"github.com/google/uuid"

	"cdr.dev/slog"
	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/database/dbauthz"
	"github.com/coder/coder/v2/coderd/database/dbtime"
	"github.com/coder/coder/v2/coderd/database/pubsub"
	"github.com/coder/coder/v2/provisionersdk"
)

const (
	// HungJobDuration is the duration of time since the last update
	// to a RUNNING job before it is considered hung.
	HungJobDuration = 5 * time.Minute

	// PendingJobDuration is the duration of time since the last update
	// to a PENDING job before it is considered dead.
	PendingJobDuration = 30 * time.Minute

	// HungJobExitTimeout is the duration of time that provisioners should allow
	// for a graceful exit upon cancellation due to failing to send an update to
	// a job.
	//
	// Provisioners should avoid keeping a job "running" for longer than this
	// time after failing to send an update to the job.
	HungJobExitTimeout = 3 * time.Minute

	// MaxJobsPerRun is the maximum number of hung jobs that the detector will
	// terminate in a single run.
	MaxJobsPerRun = 10
)

// JobLogMessages returns the messages written to provisioner job logs when a
// job is reaped.
func JobLogMessages(reapType ReapType, threshold time.Duration) []string {
	return []string{
		"",
		"====================",
		fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", reapType, threshold.Minutes()),
		"====================",
		"",
	}
}
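
// For illustration, JobLogMessages(Hung, HungJobDuration) produces:
//
//	""
//	"===================="
//	"Coder: Build has been detected as hung for 5 minutes and will be terminated."
//	"===================="
//	""
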
type jobToReap struct {
	ID        uuid.UUID
	Threshold time.Duration
	Type      ReapType
}

type ReapType string

const (
	Pending ReapType = "pending"
	Hung    ReapType = "hung"
)

// acquireLockError is returned when the detector fails to acquire a lock and
// cancels the current run.
type acquireLockError struct{}

// Error implements error.
func (acquireLockError) Error() string {
	return "lock is held by another client"
}

// jobIneligibleError is returned when a job is no longer eligible to be
// terminated.
type jobIneligibleError struct {
	Err error
}

// Error implements error.
func (e jobIneligibleError) Error() string {
	return fmt.Sprintf("job is no longer eligible to be terminated: %s", e.Err)
}

// Detector automatically detects hung or dead provisioner jobs, sends
// messages into the build log, and terminates them as failed.
type Detector struct {
	ctx    context.Context
	cancel context.CancelFunc
	done   chan struct{}

	db     database.Store
	pubsub pubsub.Pubsub
	log    slog.Logger
	tick   <-chan time.Time
	stats  chan<- Stats
}

// Stats contains statistics about the last run of the detector.
type Stats struct {
	// TerminatedJobIDs contains the IDs of all jobs that were detected as
	// hung or dead and terminated.
	TerminatedJobIDs []uuid.UUID
	// Error is the fatal error that occurred during the last run of the
	// detector, if any. Error may be set to acquireLockError if the detector
	// failed to acquire a lock.
	Error error
}

// New returns a new job reaper.
func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector {
	//nolint:gocritic // Job reaper has a limited set of permissions.
	ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx))
	d := &Detector{
		ctx:    ctx,
		cancel: cancel,
		done:   make(chan struct{}),
		db:     db,
		pubsub: pub,
		log:    log,
		tick:   tick,
		stats:  nil,
	}
	return d
}

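// A minimal usage sketch (illustrative only; it assumes a database.Store,
// pubsub.Pubsub, and slog.Logger are already in scope as db, ps, and logger):
//
//	ticker := time.NewTicker(time.Minute)
//	defer ticker.Stop()
//	reaper := jobreaper.New(ctx, db, ps, logger, ticker.C)
//	reaper.Start()
//	defer reaper.Close()
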
// WithStatsChannel will cause the detector to push a Stats to ch after every
// tick. This push is blocking, so if ch is not read, the detector will hang.
// This should only be used in tests.
func (d *Detector) WithStatsChannel(ch chan<- Stats) *Detector {
	d.stats = ch
	return d
}

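// In tests, a sketch of driving the detector deterministically with an
// unbuffered tick channel (variable names are illustrative):
//
//	tick := make(chan time.Time)
//	statsCh := make(chan jobreaper.Stats)
//	reaper := jobreaper.New(ctx, db, ps, logger, tick).WithStatsChannel(statsCh)
//	reaper.Start()
//	defer reaper.Close()
//	tick <- time.Now() // trigger one run
//	stats := <-statsCh // blocks until the run completes
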
// Start will cause the detector to detect and reap provisioner jobs on every
// tick from its channel. It will stop when its context is Done, or when its
// channel is closed.
//
// Start should only be called once.
func (d *Detector) Start() {
	go func() {
		defer close(d.done)
		defer d.cancel()

		for {
			select {
			case <-d.ctx.Done():
				return
			case t, ok := <-d.tick:
				if !ok {
					return
				}
				stats := d.run(t)
				if stats.Error != nil && !xerrors.As(stats.Error, &acquireLockError{}) {
					d.log.Warn(d.ctx, "error running job reaper once", slog.Error(stats.Error))
				}
				if d.stats != nil {
					select {
					case <-d.ctx.Done():
						return
					case d.stats <- stats:
					}
				}
			}
		}
	}()
}

// Wait will block until the detector is stopped.
func (d *Detector) Wait() {
	<-d.done
}

// Close will stop the detector.
func (d *Detector) Close() {
	d.cancel()
	<-d.done
}

func (d *Detector) run(t time.Time) Stats {
	ctx, cancel := context.WithTimeout(d.ctx, 5*time.Minute)
	defer cancel()

	stats := Stats{
		TerminatedJobIDs: []uuid.UUID{},
		Error:            nil,
	}

	// Find all provisioner jobs to be reaped.
	jobs, err := d.db.GetProvisionerJobsToBeReaped(ctx, database.GetProvisionerJobsToBeReapedParams{
		PendingSince: t.Add(-PendingJobDuration),
		HungSince:    t.Add(-HungJobDuration),
		MaxJobs:      MaxJobsPerRun,
	})
	if err != nil {
		stats.Error = xerrors.Errorf("get provisioner jobs to be reaped: %w", err)
		return stats
	}

	jobsToReap := make([]*jobToReap, 0, len(jobs))

	for _, job := range jobs {
		j := &jobToReap{
			ID: job.ID,
		}
		if job.JobStatus == database.ProvisionerJobStatusPending {
			j.Threshold = PendingJobDuration
			j.Type = Pending
		} else {
			j.Threshold = HungJobDuration
			j.Type = Hung
		}
		jobsToReap = append(jobsToReap, j)
	}

	// Send a message into the build log for each hung or pending job saying
	// that it has been detected and will be terminated, then mark the job as
	// failed.
	for _, job := range jobsToReap {
		log := d.log.With(slog.F("job_id", job.ID))

		err := reapJob(ctx, log, d.db, d.pubsub, job)
		if err != nil {
			if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) {
				log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err))
			}
			continue
		}

		stats.TerminatedJobIDs = append(stats.TerminatedJobIDs, job.ID)
	}

	return stats
}

func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobToReap *jobToReap) error {
	var lowestLogID int64

	err := db.InTx(func(db database.Store) error {
		// Refetch the job while we hold the lock.
		job, err := db.GetProvisionerJobByIDForUpdate(ctx, jobToReap.ID)
		if err != nil {
			if xerrors.Is(err, sql.ErrNoRows) {
				return acquireLockError{}
			}
			return xerrors.Errorf("get provisioner job: %w", err)
		}

		if job.CompletedAt.Valid {
			return jobIneligibleError{
				Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus),
			}
		}
		if job.UpdatedAt.After(time.Now().Add(-jobToReap.Threshold)) {
			return jobIneligibleError{
				Err: xerrors.New("job has been updated recently"),
			}
		}

		log.Warn(
			ctx, "forcefully terminating provisioner job",
			slog.F("type", jobToReap.Type),
			slog.F("threshold", jobToReap.Threshold),
		)

		// First, get the latest logs from the build so we can make sure
		// our messages are in the latest stage.
		logs, err := db.GetProvisionerLogsAfterID(ctx, database.GetProvisionerLogsAfterIDParams{
			JobID:        job.ID,
			CreatedAfter: 0,
		})
		if err != nil {
			return xerrors.Errorf("get logs for %s job: %w", jobToReap.Type, err)
		}
		logStage := ""
		if len(logs) != 0 {
			logStage = logs[len(logs)-1].Stage
		}
		if logStage == "" {
			logStage = "Unknown"
		}

		// Insert the messages into the build log.
		insertParams := database.InsertProvisionerJobLogsParams{
			JobID:     job.ID,
			CreatedAt: nil,
			Source:    nil,
			Level:     nil,
			Stage:     nil,
			Output:    nil,
		}
		now := dbtime.Now()
		for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) {
			// Set the created at in a way that ensures each message has
			// a unique timestamp so they will be sorted correctly.
			insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i)))
			insertParams.Level = append(insertParams.Level, database.LogLevelError)
			insertParams.Stage = append(insertParams.Stage, logStage)
			insertParams.Source = append(insertParams.Source, database.LogSourceProvisionerDaemon)
			insertParams.Output = append(insertParams.Output, msg)
		}
		newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams)
		if err != nil {
			return xerrors.Errorf("insert logs for %s job: %w", job.JobStatus, err)
		}
		lowestLogID = newLogs[0].ID

		// Mark the job as failed.
		now = dbtime.Now()

		// If the job was never started (pending), set the StartedAt time to
		// the current time so that the build duration is correct.
		if job.JobStatus == database.ProvisionerJobStatusPending {
			job.StartedAt = sql.NullTime{
				Time:  now,
				Valid: true,
			}
		}
		err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{
			ID:        job.ID,
			UpdatedAt: now,
			CompletedAt: sql.NullTime{
				Time:  now,
				Valid: true,
			},
			Error: sql.NullString{
				String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()),
				Valid:  true,
			},
			ErrorCode: sql.NullString{
				Valid: false,
			},
			StartedAt: job.StartedAt,
		})
		if err != nil {
			return xerrors.Errorf("mark job as failed: %w", err)
		}

		// If the provisioner job is a workspace build, copy the
		// provisioner state from the previous build to this workspace
		// build.
		if job.Type == database.ProvisionerJobTypeWorkspaceBuild {
			build, err := db.GetWorkspaceBuildByJobID(ctx, job.ID)
			if err != nil {
				return xerrors.Errorf("get workspace build for workspace build job by job id: %w", err)
			}

			// Only copy the provisioner state if there's no state in
			// the current build.
			if len(build.ProvisionerState) == 0 {
				// Get the previous build if it exists.
				prevBuild, err := db.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, database.GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams{
					WorkspaceID: build.WorkspaceID,
					BuildNumber: build.BuildNumber - 1,
				})
				if err != nil && !xerrors.Is(err, sql.ErrNoRows) {
					return xerrors.Errorf("get previous workspace build: %w", err)
				}
				if err == nil {
					err = db.UpdateWorkspaceBuildProvisionerStateByID(ctx, database.UpdateWorkspaceBuildProvisionerStateByIDParams{
						ID:               build.ID,
						UpdatedAt:        dbtime.Now(),
						ProvisionerState: prevBuild.ProvisionerState,
					})
					if err != nil {
						return xerrors.Errorf("update workspace build by id: %w", err)
					}
				}
			}
		}

		return nil
	}, nil)
	if err != nil {
		return xerrors.Errorf("in tx: %w", err)
	}

	// Publish the new log notification to pubsub. Use the lowest log ID
	// inserted so the log stream will fetch everything after that point.
	data, err := json.Marshal(provisionersdk.ProvisionerJobLogsNotifyMessage{
		CreatedAfter: lowestLogID - 1,
		EndOfLogs:    true,
	})
	if err != nil {
		return xerrors.Errorf("marshal log notification: %w", err)
	}
	err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobToReap.ID), data)
	if err != nil {
		return xerrors.Errorf("publish log notification: %w", err)
	}

	return nil
}