mirror of
https://github.com/coder/coder.git
synced 2025-07-09 11:45:56 +00:00
395
coderd/jobreaper/detector.go
Normal file
395
coderd/jobreaper/detector.go
Normal file
@ -0,0 +1,395 @@
|
||||
package jobreaper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt" //#nosec // this is only used for shuffling an array to pick random jobs to unhang
|
||||
"time"
|
||||
|
||||
"golang.org/x/xerrors"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"cdr.dev/slog"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtime"
|
||||
"github.com/coder/coder/v2/coderd/database/pubsub"
|
||||
"github.com/coder/coder/v2/provisionersdk"
|
||||
)
|
||||
|
||||
const (
|
||||
// HungJobDuration is the duration of time since the last update
|
||||
// to a RUNNING job before it is considered hung.
|
||||
HungJobDuration = 5 * time.Minute
|
||||
|
||||
// PendingJobDuration is the duration of time since last update
|
||||
// to a PENDING job before it is considered dead.
|
||||
PendingJobDuration = 30 * time.Minute
|
||||
|
||||
// HungJobExitTimeout is the duration of time that provisioners should allow
|
||||
// for a graceful exit upon cancellation due to failing to send an update to
|
||||
// a job.
|
||||
//
|
||||
// Provisioners should avoid keeping a job "running" for longer than this
|
||||
// time after failing to send an update to the job.
|
||||
HungJobExitTimeout = 3 * time.Minute
|
||||
|
||||
// MaxJobsPerRun is the maximum number of hung jobs that the detector will
|
||||
// terminate in a single run.
|
||||
MaxJobsPerRun = 10
|
||||
)
|
||||
|
||||
// jobLogMessages are written to provisioner job logs when a job is reaped
|
||||
func JobLogMessages(reapType ReapType, threshold time.Duration) []string {
|
||||
return []string{
|
||||
"",
|
||||
"====================",
|
||||
fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", reapType, threshold.Minutes()),
|
||||
"====================",
|
||||
"",
|
||||
}
|
||||
}
|
||||
|
||||
type jobToReap struct {
|
||||
ID uuid.UUID
|
||||
Threshold time.Duration
|
||||
Type ReapType
|
||||
}
|
||||
|
||||
type ReapType string
|
||||
|
||||
const (
|
||||
Pending ReapType = "pending"
|
||||
Hung ReapType = "hung"
|
||||
)
|
||||
|
||||
// acquireLockError is returned when the detector fails to acquire a lock and
|
||||
// cancels the current run.
|
||||
type acquireLockError struct{}
|
||||
|
||||
// Error implements error.
|
||||
func (acquireLockError) Error() string {
|
||||
return "lock is held by another client"
|
||||
}
|
||||
|
||||
// jobIneligibleError is returned when a job is not eligible to be terminated
|
||||
// anymore.
|
||||
type jobIneligibleError struct {
|
||||
Err error
|
||||
}
|
||||
|
||||
// Error implements error.
|
||||
func (e jobIneligibleError) Error() string {
|
||||
return fmt.Sprintf("job is no longer eligible to be terminated: %s", e.Err)
|
||||
}
|
||||
|
||||
// Detector automatically detects hung provisioner jobs, sends messages into the
|
||||
// build log and terminates them as failed.
|
||||
type Detector struct {
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
done chan struct{}
|
||||
|
||||
db database.Store
|
||||
pubsub pubsub.Pubsub
|
||||
log slog.Logger
|
||||
tick <-chan time.Time
|
||||
stats chan<- Stats
|
||||
}
|
||||
|
||||
// Stats contains statistics about the last run of the detector.
|
||||
type Stats struct {
|
||||
// TerminatedJobIDs contains the IDs of all jobs that were detected as hung and
|
||||
// terminated.
|
||||
TerminatedJobIDs []uuid.UUID
|
||||
// Error is the fatal error that occurred during the last run of the
|
||||
// detector, if any. Error may be set to AcquireLockError if the detector
|
||||
// failed to acquire a lock.
|
||||
Error error
|
||||
}
|
||||
|
||||
// New returns a new job reaper.
|
||||
func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector {
|
||||
//nolint:gocritic // Job reaper has a limited set of permissions.
|
||||
ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx))
|
||||
d := &Detector{
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
done: make(chan struct{}),
|
||||
db: db,
|
||||
pubsub: pub,
|
||||
log: log,
|
||||
tick: tick,
|
||||
stats: nil,
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// WithStatsChannel will cause Executor to push a RunStats to ch after
|
||||
// every tick. This push is blocking, so if ch is not read, the detector will
|
||||
// hang. This should only be used in tests.
|
||||
func (d *Detector) WithStatsChannel(ch chan<- Stats) *Detector {
|
||||
d.stats = ch
|
||||
return d
|
||||
}
|
||||
|
||||
// Start will cause the detector to detect and unhang provisioner jobs on every
|
||||
// tick from its channel. It will stop when its context is Done, or when its
|
||||
// channel is closed.
|
||||
//
|
||||
// Start should only be called once.
|
||||
func (d *Detector) Start() {
|
||||
go func() {
|
||||
defer close(d.done)
|
||||
defer d.cancel()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-d.ctx.Done():
|
||||
return
|
||||
case t, ok := <-d.tick:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
stats := d.run(t)
|
||||
if stats.Error != nil && !xerrors.As(stats.Error, &acquireLockError{}) {
|
||||
d.log.Warn(d.ctx, "error running workspace build hang detector once", slog.Error(stats.Error))
|
||||
}
|
||||
if d.stats != nil {
|
||||
select {
|
||||
case <-d.ctx.Done():
|
||||
return
|
||||
case d.stats <- stats:
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait will block until the detector is stopped.
|
||||
func (d *Detector) Wait() {
|
||||
<-d.done
|
||||
}
|
||||
|
||||
// Close will stop the detector.
|
||||
func (d *Detector) Close() {
|
||||
d.cancel()
|
||||
<-d.done
|
||||
}
|
||||
|
||||
func (d *Detector) run(t time.Time) Stats {
|
||||
ctx, cancel := context.WithTimeout(d.ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
stats := Stats{
|
||||
TerminatedJobIDs: []uuid.UUID{},
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
// Find all provisioner jobs to be reaped
|
||||
jobs, err := d.db.GetProvisionerJobsToBeReaped(ctx, database.GetProvisionerJobsToBeReapedParams{
|
||||
PendingSince: t.Add(-PendingJobDuration),
|
||||
HungSince: t.Add(-HungJobDuration),
|
||||
MaxJobs: MaxJobsPerRun,
|
||||
})
|
||||
if err != nil {
|
||||
stats.Error = xerrors.Errorf("get provisioner jobs to be reaped: %w", err)
|
||||
return stats
|
||||
}
|
||||
|
||||
jobsToReap := make([]*jobToReap, 0, len(jobs))
|
||||
|
||||
for _, job := range jobs {
|
||||
j := &jobToReap{
|
||||
ID: job.ID,
|
||||
}
|
||||
if job.JobStatus == database.ProvisionerJobStatusPending {
|
||||
j.Threshold = PendingJobDuration
|
||||
j.Type = Pending
|
||||
} else {
|
||||
j.Threshold = HungJobDuration
|
||||
j.Type = Hung
|
||||
}
|
||||
jobsToReap = append(jobsToReap, j)
|
||||
}
|
||||
|
||||
// Send a message into the build log for each hung or pending job saying that it
|
||||
// has been detected and will be terminated, then mark the job as failed.
|
||||
for _, job := range jobsToReap {
|
||||
log := d.log.With(slog.F("job_id", job.ID))
|
||||
|
||||
err := reapJob(ctx, log, d.db, d.pubsub, job)
|
||||
if err != nil {
|
||||
if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) {
|
||||
log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
stats.TerminatedJobIDs = append(stats.TerminatedJobIDs, job.ID)
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobToReap *jobToReap) error {
|
||||
var lowestLogID int64
|
||||
|
||||
err := db.InTx(func(db database.Store) error {
|
||||
// Refetch the job while we hold the lock.
|
||||
job, err := db.GetProvisionerJobByIDForUpdate(ctx, jobToReap.ID)
|
||||
if err != nil {
|
||||
if xerrors.Is(err, sql.ErrNoRows) {
|
||||
return acquireLockError{}
|
||||
}
|
||||
return xerrors.Errorf("get provisioner job: %w", err)
|
||||
}
|
||||
|
||||
if job.CompletedAt.Valid {
|
||||
return jobIneligibleError{
|
||||
Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus),
|
||||
}
|
||||
}
|
||||
if job.UpdatedAt.After(time.Now().Add(-jobToReap.Threshold)) {
|
||||
return jobIneligibleError{
|
||||
Err: xerrors.New("job has been updated recently"),
|
||||
}
|
||||
}
|
||||
|
||||
log.Warn(
|
||||
ctx, "forcefully terminating provisioner job",
|
||||
"type", jobToReap.Type,
|
||||
"threshold", jobToReap.Threshold,
|
||||
)
|
||||
|
||||
// First, get the latest logs from the build so we can make sure
|
||||
// our messages are in the latest stage.
|
||||
logs, err := db.GetProvisionerLogsAfterID(ctx, database.GetProvisionerLogsAfterIDParams{
|
||||
JobID: job.ID,
|
||||
CreatedAfter: 0,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get logs for %s job: %w", jobToReap.Type, err)
|
||||
}
|
||||
logStage := ""
|
||||
if len(logs) != 0 {
|
||||
logStage = logs[len(logs)-1].Stage
|
||||
}
|
||||
if logStage == "" {
|
||||
logStage = "Unknown"
|
||||
}
|
||||
|
||||
// Insert the messages into the build log.
|
||||
insertParams := database.InsertProvisionerJobLogsParams{
|
||||
JobID: job.ID,
|
||||
CreatedAt: nil,
|
||||
Source: nil,
|
||||
Level: nil,
|
||||
Stage: nil,
|
||||
Output: nil,
|
||||
}
|
||||
now := dbtime.Now()
|
||||
for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) {
|
||||
// Set the created at in a way that ensures each message has
|
||||
// a unique timestamp so they will be sorted correctly.
|
||||
insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i)))
|
||||
insertParams.Level = append(insertParams.Level, database.LogLevelError)
|
||||
insertParams.Stage = append(insertParams.Stage, logStage)
|
||||
insertParams.Source = append(insertParams.Source, database.LogSourceProvisionerDaemon)
|
||||
insertParams.Output = append(insertParams.Output, msg)
|
||||
}
|
||||
newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("insert logs for %s job: %w", job.JobStatus, err)
|
||||
}
|
||||
lowestLogID = newLogs[0].ID
|
||||
|
||||
// Mark the job as failed.
|
||||
now = dbtime.Now()
|
||||
|
||||
// If the job was never started (pending), set the StartedAt time to the current
|
||||
// time so that the build duration is correct.
|
||||
if job.JobStatus == database.ProvisionerJobStatusPending {
|
||||
job.StartedAt = sql.NullTime{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
}
|
||||
}
|
||||
err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{
|
||||
ID: job.ID,
|
||||
UpdatedAt: now,
|
||||
CompletedAt: sql.NullTime{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
},
|
||||
Error: sql.NullString{
|
||||
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()),
|
||||
Valid: true,
|
||||
},
|
||||
ErrorCode: sql.NullString{
|
||||
Valid: false,
|
||||
},
|
||||
StartedAt: job.StartedAt,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("mark job as failed: %w", err)
|
||||
}
|
||||
|
||||
// If the provisioner job is a workspace build, copy the
|
||||
// provisioner state from the previous build to this workspace
|
||||
// build.
|
||||
if job.Type == database.ProvisionerJobTypeWorkspaceBuild {
|
||||
build, err := db.GetWorkspaceBuildByJobID(ctx, job.ID)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("get workspace build for workspace build job by job id: %w", err)
|
||||
}
|
||||
|
||||
// Only copy the provisioner state if there's no state in
|
||||
// the current build.
|
||||
if len(build.ProvisionerState) == 0 {
|
||||
// Get the previous build if it exists.
|
||||
prevBuild, err := db.GetWorkspaceBuildByWorkspaceIDAndBuildNumber(ctx, database.GetWorkspaceBuildByWorkspaceIDAndBuildNumberParams{
|
||||
WorkspaceID: build.WorkspaceID,
|
||||
BuildNumber: build.BuildNumber - 1,
|
||||
})
|
||||
if err != nil && !xerrors.Is(err, sql.ErrNoRows) {
|
||||
return xerrors.Errorf("get previous workspace build: %w", err)
|
||||
}
|
||||
if err == nil {
|
||||
err = db.UpdateWorkspaceBuildProvisionerStateByID(ctx, database.UpdateWorkspaceBuildProvisionerStateByIDParams{
|
||||
ID: build.ID,
|
||||
UpdatedAt: dbtime.Now(),
|
||||
ProvisionerState: prevBuild.ProvisionerState,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("update workspace build by id: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("in tx: %w", err)
|
||||
}
|
||||
|
||||
// Publish the new log notification to pubsub. Use the lowest log ID
|
||||
// inserted so the log stream will fetch everything after that point.
|
||||
data, err := json.Marshal(provisionersdk.ProvisionerJobLogsNotifyMessage{
|
||||
CreatedAfter: lowestLogID - 1,
|
||||
EndOfLogs: true,
|
||||
})
|
||||
if err != nil {
|
||||
return xerrors.Errorf("marshal log notification: %w", err)
|
||||
}
|
||||
err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobToReap.ID), data)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("publish log notification: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
Reference in New Issue
Block a user