mirror of
https://github.com/coder/coder.git
synced 2025-07-15 22:20:27 +00:00
feat: add provisioner job hang detector (#7927)
This commit is contained in:
@ -18,13 +18,6 @@ import (
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept to avoid reusing the
// same ID.
const (
	// LockIDDeploymentSetup evaluates to 1: iota starts at 0 and the + 1
	// skips the zero value.
	LockIDDeploymentSetup = iota + 1
)
|
||||
|
||||
// Store contains all queryable database functions.
|
||||
// It extends the generated interface to add transaction support.
|
||||
type Store interface {
|
||||
|
@ -3,7 +3,6 @@ package db2sdk
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
@ -81,6 +80,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
|
||||
}
|
||||
|
||||
func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
|
||||
// The case where jobs are hung is handled by the unhang package. We can't
|
||||
// just return Failed here when it's hung because that doesn't reflect in
|
||||
// the database.
|
||||
switch {
|
||||
case provisionerJob.CanceledAt.Valid:
|
||||
if !provisionerJob.CompletedAt.Valid {
|
||||
@ -97,8 +99,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
|
||||
return codersdk.ProvisionerJobSucceeded
|
||||
}
|
||||
return codersdk.ProvisionerJobFailed
|
||||
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
|
||||
return codersdk.ProvisionerJobFailed
|
||||
default:
|
||||
return codersdk.ProvisionerJobRunning
|
||||
}
|
||||
|
@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
|
||||
},
|
||||
status: codersdk.ProvisionerJobFailed,
|
||||
},
|
||||
{
|
||||
name: "not_updated",
|
||||
job: database.ProvisionerJob{
|
||||
StartedAt: sql.NullTime{
|
||||
Time: database.Now().Add(-time.Minute),
|
||||
Valid: true,
|
||||
},
|
||||
UpdatedAt: database.Now().Add(-31 * time.Second),
|
||||
},
|
||||
status: codersdk.ProvisionerJobFailed,
|
||||
},
|
||||
{
|
||||
name: "updated",
|
||||
job: database.ProvisionerJob{
|
||||
|
@ -176,6 +176,25 @@ var (
|
||||
Scope: rbac.ScopeAll,
|
||||
}.WithCachedASTValue()
|
||||
|
||||
// See unhanger package.
|
||||
subjectHangDetector = rbac.Subject{
|
||||
ID: uuid.Nil.String(),
|
||||
Roles: rbac.Roles([]rbac.Role{
|
||||
{
|
||||
Name: "hangdetector",
|
||||
DisplayName: "Hang Detector Daemon",
|
||||
Site: rbac.Permissions(map[string][]rbac.Action{
|
||||
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
|
||||
rbac.ResourceTemplate.Type: {rbac.ActionRead},
|
||||
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
|
||||
}),
|
||||
Org: map[string][]rbac.Permission{},
|
||||
User: []rbac.Permission{},
|
||||
},
|
||||
}),
|
||||
Scope: rbac.ScopeAll,
|
||||
}.WithCachedASTValue()
|
||||
|
||||
subjectSystemRestricted = rbac.Subject{
|
||||
ID: uuid.Nil.String(),
|
||||
Roles: rbac.Roles([]rbac.Role{
|
||||
@ -217,6 +236,12 @@ func AsAutostart(ctx context.Context) context.Context {
|
||||
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
|
||||
}
|
||||
|
||||
// AsHangDetector returns a context with an actor that has permissions required
|
||||
// for unhanger.Detector to function.
|
||||
func AsHangDetector(ctx context.Context) context.Context {
|
||||
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
|
||||
}
|
||||
|
||||
// AsSystemRestricted returns a context with an actor that has permissions
|
||||
// required for various system operations (login, logout, metrics cache).
|
||||
func AsSystemRestricted(ctx context.Context) context.Context {
|
||||
@ -950,6 +975,14 @@ func (q *querier) GetGroupsByOrganizationID(ctx context.Context, organizationID
|
||||
return fetchWithPostFilter(q.auth, q.db.GetGroupsByOrganizationID)(ctx, organizationID)
|
||||
}
|
||||
|
||||
// TODO: We need to create a ProvisionerJob resource type
|
||||
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
|
||||
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
|
||||
// return nil, err
|
||||
// }
|
||||
return q.db.GetHungProvisionerJobs(ctx, hungSince)
|
||||
}
|
||||
|
||||
func (q *querier) GetLastUpdateCheck(ctx context.Context) (string, error) {
|
||||
if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil {
|
||||
return "", err
|
||||
|
@ -1753,6 +1753,19 @@ func (q *fakeQuerier) GetGroupsByOrganizationID(_ context.Context, organizationI
|
||||
return groups, nil
|
||||
}
|
||||
|
||||
func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
|
||||
q.mutex.RLock()
|
||||
defer q.mutex.RUnlock()
|
||||
|
||||
hungJobs := []database.ProvisionerJob{}
|
||||
for _, provisionerJob := range q.provisionerJobs {
|
||||
if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) {
|
||||
hungJobs = append(hungJobs, provisionerJob)
|
||||
}
|
||||
}
|
||||
return hungJobs, nil
|
||||
}
|
||||
|
||||
func (q *fakeQuerier) GetLastUpdateCheck(_ context.Context) (string, error) {
|
||||
q.mutex.RLock()
|
||||
defer q.mutex.RUnlock()
|
||||
@ -2135,7 +2148,7 @@ func (q *fakeQuerier) GetProvisionerLogsAfterID(_ context.Context, arg database.
|
||||
if jobLog.JobID != arg.JobID {
|
||||
continue
|
||||
}
|
||||
if arg.CreatedAfter != 0 && jobLog.ID < arg.CreatedAfter {
|
||||
if jobLog.ID <= arg.CreatedAfter {
|
||||
continue
|
||||
}
|
||||
logs = append(logs, jobLog)
|
||||
|
@ -399,6 +399,13 @@ func (m metricsStore) GetGroupsByOrganizationID(ctx context.Context, organizatio
|
||||
return groups, err
|
||||
}
|
||||
|
||||
func (m metricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
|
||||
start := time.Now()
|
||||
jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince)
|
||||
m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds())
|
||||
return jobs, err
|
||||
}
|
||||
|
||||
func (m metricsStore) GetLastUpdateCheck(ctx context.Context) (string, error) {
|
||||
start := time.Now()
|
||||
version, err := m.s.GetLastUpdateCheck(ctx)
|
||||
|
@ -701,6 +701,21 @@ func (mr *MockStoreMockRecorder) GetGroupsByOrganizationID(arg0, arg1 interface{
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetGroupsByOrganizationID", reflect.TypeOf((*MockStore)(nil).GetGroupsByOrganizationID), arg0, arg1)
|
||||
}
|
||||
|
||||
// GetHungProvisionerJobs mocks base method.
|
||||
func (m *MockStore) GetHungProvisionerJobs(arg0 context.Context, arg1 time.Time) ([]database.ProvisionerJob, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "GetHungProvisionerJobs", arg0, arg1)
|
||||
ret0, _ := ret[0].([]database.ProvisionerJob)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// GetHungProvisionerJobs indicates an expected call of GetHungProvisionerJobs.
|
||||
func (mr *MockStoreMockRecorder) GetHungProvisionerJobs(arg0, arg1 interface{}) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHungProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetHungProvisionerJobs), arg0, arg1)
|
||||
}
|
||||
|
||||
// GetLastUpdateCheck mocks base method.
|
||||
func (m *MockStore) GetLastUpdateCheck(arg0 context.Context) (string, error) {
|
||||
m.ctrl.T.Helper()
|
||||
|
19
coderd/database/lock.go
Normal file
19
coderd/database/lock.go
Normal file
@ -0,0 +1,19 @@
|
||||
package database
|
||||
|
||||
import "hash/fnv"
|
||||
|
||||
// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept in this list to avoid
// reusing the same ID.
const (
	// Keep the unused iota here so we don't need + 1 every time.
	// lockIDUnused occupies the value 0.
	lockIDUnused = iota
	// LockIDDeploymentSetup is the first real lock ID and has the value 1.
	LockIDDeploymentSetup
)
|
||||
|
||||
// GenLockID generates a consistent lock ID from a given string: the same name
// always yields the same ID.
//
// The ID is the 64-bit FNV-1 hash of name reinterpreted as a signed integer,
// so it may be negative. Because it is a hash, distinct names are very likely,
// but not guaranteed, to produce distinct IDs.
func GenLockID(name string) int64 {
	hash := fnv.New64()
	// Write on a hash.Hash never returns an error, so the results are
	// deliberately discarded.
	_, _ = hash.Write([]byte(name))
	// The uint64 -> int64 conversion keeps the bit pattern and wraps values
	// above math.MaxInt64 into the negative range.
	return int64(hash.Sum64())
}
|
@ -16,8 +16,6 @@ type sqlcQuerier interface {
|
||||
//
|
||||
// This must be called from within a transaction. The lock will be automatically
|
||||
// released when the transaction ends.
|
||||
//
|
||||
// Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
AcquireLock(ctx context.Context, pgAdvisoryXactLock int64) error
|
||||
// Acquires the lock for a single job that isn't started, completed,
|
||||
// canceled, and that matches an array of provisioner types.
|
||||
@ -75,6 +73,7 @@ type sqlcQuerier interface {
|
||||
GetGroupByOrgAndName(ctx context.Context, arg GetGroupByOrgAndNameParams) (Group, error)
|
||||
GetGroupMembers(ctx context.Context, groupID uuid.UUID) ([]User, error)
|
||||
GetGroupsByOrganizationID(ctx context.Context, organizationID uuid.UUID) ([]Group, error)
|
||||
GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error)
|
||||
GetLastUpdateCheck(ctx context.Context) (string, error)
|
||||
GetLatestWorkspaceBuildByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) (WorkspaceBuild, error)
|
||||
GetLatestWorkspaceBuilds(ctx context.Context) ([]WorkspaceBuild, error)
|
||||
@ -217,8 +216,6 @@ type sqlcQuerier interface {
|
||||
//
|
||||
// This must be called from within a transaction. The lock will be automatically
|
||||
// released when the transaction ends.
|
||||
//
|
||||
// Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
TryAcquireLock(ctx context.Context, pgTryAdvisoryXactLock int64) (bool, error)
|
||||
UpdateAPIKeyByID(ctx context.Context, arg UpdateAPIKeyByIDParams) error
|
||||
UpdateGitAuthLink(ctx context.Context, arg UpdateGitAuthLinkParams) (GitAuthLink, error)
|
||||
|
@ -1527,8 +1527,6 @@ SELECT pg_advisory_xact_lock($1)
|
||||
//
|
||||
// This must be called from within a transaction. The lock will be automatically
|
||||
// released when the transaction ends.
|
||||
//
|
||||
// Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
func (q *sqlQuerier) AcquireLock(ctx context.Context, pgAdvisoryXactLock int64) error {
|
||||
_, err := q.db.ExecContext(ctx, acquireLock, pgAdvisoryXactLock)
|
||||
return err
|
||||
@ -1542,8 +1540,6 @@ SELECT pg_try_advisory_xact_lock($1)
|
||||
//
|
||||
// This must be called from within a transaction. The lock will be automatically
|
||||
// released when the transaction ends.
|
||||
//
|
||||
// Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
func (q *sqlQuerier) TryAcquireLock(ctx context.Context, pgTryAdvisoryXactLock int64) (bool, error) {
|
||||
row := q.db.QueryRowContext(ctx, tryAcquireLock, pgTryAdvisoryXactLock)
|
||||
var pg_try_advisory_xact_lock bool
|
||||
@ -2201,6 +2197,59 @@ func (q *sqlQuerier) AcquireProvisionerJob(ctx context.Context, arg AcquireProvi
|
||||
return i, err
|
||||
}
|
||||
|
||||
// getHungProvisionerJobs selects all provisioner jobs that have started, have
// not completed, and were last updated before the timestamp bound to $1.
const getHungProvisionerJobs = `-- name: GetHungProvisionerJobs :many
SELECT
id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata
FROM
provisioner_jobs
WHERE
updated_at < $1
AND started_at IS NOT NULL
AND completed_at IS NULL
`
|
||||
|
||||
func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) {
|
||||
rows, err := q.db.QueryContext(ctx, getHungProvisionerJobs, updatedAt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []ProvisionerJob
|
||||
for rows.Next() {
|
||||
var i ProvisionerJob
|
||||
if err := rows.Scan(
|
||||
&i.ID,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
&i.StartedAt,
|
||||
&i.CanceledAt,
|
||||
&i.CompletedAt,
|
||||
&i.Error,
|
||||
&i.OrganizationID,
|
||||
&i.InitiatorID,
|
||||
&i.Provisioner,
|
||||
&i.StorageMethod,
|
||||
&i.Type,
|
||||
&i.Input,
|
||||
&i.WorkerID,
|
||||
&i.FileID,
|
||||
&i.Tags,
|
||||
&i.ErrorCode,
|
||||
&i.TraceMetadata,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one
|
||||
SELECT
|
||||
id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata
|
||||
|
@ -3,8 +3,6 @@
|
||||
--
|
||||
-- This must be called from within a transaction. The lock will be automatically
|
||||
-- released when the transaction ends.
|
||||
--
|
||||
-- Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
SELECT pg_advisory_xact_lock($1);
|
||||
|
||||
-- name: TryAcquireLock :one
|
||||
@ -12,6 +10,4 @@ SELECT pg_advisory_xact_lock($1);
|
||||
--
|
||||
-- This must be called from within a transaction. The lock will be automatically
|
||||
-- released when the transaction ends.
|
||||
--
|
||||
-- Use database.GenLockID() to generate a unique lock ID from a string.
|
||||
SELECT pg_try_advisory_xact_lock($1);
|
||||
|
@ -128,3 +128,13 @@ SET
|
||||
error_code = $5
|
||||
WHERE
|
||||
id = $1;
|
||||
|
||||
-- name: GetHungProvisionerJobs :many
-- Returns all provisioner jobs that have started, have not completed, and
-- were last updated before the timestamp passed as $1.
SELECT
	*
FROM
	provisioner_jobs
WHERE
	updated_at < $1
	AND started_at IS NOT NULL
	AND completed_at IS NULL;
|
||||
|
Reference in New Issue
Block a user