feat: add provisioner job hang detector (#7927)

This commit is contained in:
Dean Sheather
2023-06-25 23:17:00 +10:00
committed by GitHub
parent 3671846b1b
commit 98a5ae7f48
28 changed files with 1414 additions and 54 deletions

View File

@ -18,13 +18,6 @@ import (
"golang.org/x/xerrors"
)
// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept to avoid reusing the
// same ID.
const (
LockIDDeploymentSetup = iota + 1
)
// Store contains all queryable database functions.
// It extends the generated interface to add transaction support.
type Store interface {

View File

@ -3,7 +3,6 @@ package db2sdk
import (
"encoding/json"
"time"
"github.com/google/uuid"
@ -81,6 +80,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
}
func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
// The case where jobs are hung is handled by the unhanger package. We can't
// just return Failed here when a job is hung because that wouldn't be
// reflected in the database.
switch {
case provisionerJob.CanceledAt.Valid:
if !provisionerJob.CompletedAt.Valid {
@ -97,8 +99,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
return codersdk.ProvisionerJobSucceeded
}
return codersdk.ProvisionerJobFailed
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
return codersdk.ProvisionerJobFailed
default:
return codersdk.ProvisionerJobRunning
}

View File

@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "not_updated",
job: database.ProvisionerJob{
StartedAt: sql.NullTime{
Time: database.Now().Add(-time.Minute),
Valid: true,
},
UpdatedAt: database.Now().Add(-31 * time.Second),
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "updated",
job: database.ProvisionerJob{

View File

@ -176,6 +176,25 @@ var (
Scope: rbac.ScopeAll,
}.WithCachedASTValue()
// subjectHangDetector is the actor that AsHangDetector places on a context.
// See unhanger package.
subjectHangDetector = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Name: "hangdetector",
DisplayName: "Hang Detector Daemon",
// Site-wide grants: every action on system resources, read on
// templates, and read plus update on workspaces.
Site: rbac.Permissions(map[string][]rbac.Action{
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
rbac.ResourceTemplate.Type: {rbac.ActionRead},
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
}),
// No organization- or user-level permissions are granted.
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
},
}),
Scope: rbac.ScopeAll,
}.WithCachedASTValue()
subjectSystemRestricted = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
@ -217,6 +236,12 @@ func AsAutostart(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
}
// AsHangDetector returns a context derived from ctx whose actor is
// subjectHangDetector, which carries the permissions required for
// unhanger.Detector to function.
func AsHangDetector(ctx context.Context) context.Context {
// The actor is stored under authContextKey{}, the same key AsAutostart uses.
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
}
// AsSystemRestricted returns a context with an actor that has permissions
// required for various system operations (login, logout, metrics cache).
func AsSystemRestricted(ctx context.Context) context.Context {
@ -950,6 +975,14 @@ func (q *querier) GetGroupsByOrganizationID(ctx context.Context, organizationID
return fetchWithPostFilter(q.auth, q.db.GetGroupsByOrganizationID)(ctx, organizationID)
}
// GetHungProvisionerJobs forwards the call to the wrapped store and returns
// its result unchanged.
//
// NOTE: no authorization is performed here; the check below is commented out,
// so any caller holding this querier can list hung jobs.
// TODO: We need to create a ProvisionerJob resource type
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
// return nil, err
// }
return q.db.GetHungProvisionerJobs(ctx, hungSince)
}
func (q *querier) GetLastUpdateCheck(ctx context.Context) (string, error) {
if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil {
return "", err

View File

@ -1753,6 +1753,19 @@ func (q *fakeQuerier) GetGroupsByOrganizationID(_ context.Context, organizationI
return groups, nil
}
// GetHungProvisionerJobs returns every in-memory provisioner job that has
// started, has not completed, and was last updated before hungSince. The
// result is an empty (non-nil) slice when nothing matches; the error is
// always nil.
func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
	q.mutex.RLock()
	defer q.mutex.RUnlock()

	hung := []database.ProvisionerJob{}
	for _, job := range q.provisionerJobs {
		// Only jobs that are started and still incomplete can be hung.
		if !job.StartedAt.Valid || job.CompletedAt.Valid {
			continue
		}
		// Jobs updated at or after the cutoff are still considered alive.
		if !job.UpdatedAt.Before(hungSince) {
			continue
		}
		hung = append(hung, job)
	}
	return hung, nil
}
func (q *fakeQuerier) GetLastUpdateCheck(_ context.Context) (string, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
@ -2135,7 +2148,7 @@ func (q *fakeQuerier) GetProvisionerLogsAfterID(_ context.Context, arg database.
if jobLog.JobID != arg.JobID {
continue
}
if arg.CreatedAfter != 0 && jobLog.ID < arg.CreatedAfter {
if jobLog.ID <= arg.CreatedAfter {
continue
}
logs = append(logs, jobLog)

View File

@ -399,6 +399,13 @@ func (m metricsStore) GetGroupsByOrganizationID(ctx context.Context, organizatio
return groups, err
}
// GetHungProvisionerJobs calls the wrapped store and records how long the
// call took in the query latency metric under the "GetHungProvisionerJobs"
// label. The store's result and error are returned unchanged.
func (m metricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
	begin := time.Now()
	hungJobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince)
	elapsed := time.Since(begin)
	m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(elapsed.Seconds())
	return hungJobs, err
}
func (m metricsStore) GetLastUpdateCheck(ctx context.Context) (string, error) {
start := time.Now()
version, err := m.s.GetLastUpdateCheck(ctx)

View File

@ -701,6 +701,21 @@ func (mr *MockStoreMockRecorder) GetGroupsByOrganizationID(arg0, arg1 interface{
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetGroupsByOrganizationID", reflect.TypeOf((*MockStore)(nil).GetGroupsByOrganizationID), arg0, arg1)
}
// GetHungProvisionerJobs mocks base method.
func (m *MockStore) GetHungProvisionerJobs(arg0 context.Context, arg1 time.Time) ([]database.ProvisionerJob, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "GetHungProvisionerJobs", arg0, arg1)
// The comma-ok assertions leave the zero value (nil) in place when the
// recorded return value is not of the expected type.
ret0, _ := ret[0].([]database.ProvisionerJob)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// GetHungProvisionerJobs indicates an expected call of GetHungProvisionerJobs.
func (mr *MockStoreMockRecorder) GetHungProvisionerJobs(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
// The method's reflected type is passed along with the expected arguments.
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHungProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetHungProvisionerJobs), arg0, arg1)
}
// GetLastUpdateCheck mocks base method.
func (m *MockStore) GetLastUpdateCheck(arg0 context.Context) (string, error) {
m.ctrl.T.Helper()

19
coderd/database/lock.go Normal file
View File

@ -0,0 +1,19 @@
package database
import "hash/fnv"
// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept in this list to avoid
// reusing the same ID.
const (
	// Discard iota's zero value so the first real lock ID is 1 and later
	// entries don't need an explicit "+ 1". The blank identifier is used
	// instead of a named placeholder so no unused constant is declared.
	_ = iota
	LockIDDeploymentSetup
)
// GenLockID derives a lock ID from name by hashing it with 64-bit FNV-1. The
// same name always yields the same ID. IDs are only unique up to hash
// collisions, and the result may be negative because the unsigned hash value
// is reinterpreted as an int64.
func GenLockID(name string) int64 {
	hasher := fnv.New64()
	// Writing to a hash.Hash never returns an error, so it is safe to ignore.
	_, _ = hasher.Write([]byte(name))
	sum := hasher.Sum64()
	return int64(sum)
}

View File

@ -16,8 +16,6 @@ type sqlcQuerier interface {
//
// This must be called from within a transaction. The lock will be automatically
// released when the transaction ends.
//
// Use database.LockID() to generate a unique lock ID from a string.
AcquireLock(ctx context.Context, pgAdvisoryXactLock int64) error
// Acquires the lock for a single job that isn't started, completed,
// canceled, and that matches an array of provisioner types.
@ -75,6 +73,7 @@ type sqlcQuerier interface {
GetGroupByOrgAndName(ctx context.Context, arg GetGroupByOrgAndNameParams) (Group, error)
GetGroupMembers(ctx context.Context, groupID uuid.UUID) ([]User, error)
GetGroupsByOrganizationID(ctx context.Context, organizationID uuid.UUID) ([]Group, error)
GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error)
GetLastUpdateCheck(ctx context.Context) (string, error)
GetLatestWorkspaceBuildByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) (WorkspaceBuild, error)
GetLatestWorkspaceBuilds(ctx context.Context) ([]WorkspaceBuild, error)
@ -217,8 +216,6 @@ type sqlcQuerier interface {
//
// This must be called from within a transaction. The lock will be automatically
// released when the transaction ends.
//
// Use database.LockID() to generate a unique lock ID from a string.
TryAcquireLock(ctx context.Context, pgTryAdvisoryXactLock int64) (bool, error)
UpdateAPIKeyByID(ctx context.Context, arg UpdateAPIKeyByIDParams) error
UpdateGitAuthLink(ctx context.Context, arg UpdateGitAuthLinkParams) (GitAuthLink, error)

View File

@ -1527,8 +1527,6 @@ SELECT pg_advisory_xact_lock($1)
//
// This must be called from within a transaction. The lock will be automatically
// released when the transaction ends.
//
// Use database.LockID() to generate a unique lock ID from a string.
func (q *sqlQuerier) AcquireLock(ctx context.Context, pgAdvisoryXactLock int64) error {
_, err := q.db.ExecContext(ctx, acquireLock, pgAdvisoryXactLock)
return err
@ -1542,8 +1540,6 @@ SELECT pg_try_advisory_xact_lock($1)
//
// This must be called from within a transaction. The lock will be automatically
// released when the transaction ends.
//
// Use database.LockID() to generate a unique lock ID from a string.
func (q *sqlQuerier) TryAcquireLock(ctx context.Context, pgTryAdvisoryXactLock int64) (bool, error) {
row := q.db.QueryRowContext(ctx, tryAcquireLock, pgTryAdvisoryXactLock)
var pg_try_advisory_xact_lock bool
@ -2201,6 +2197,59 @@ func (q *sqlQuerier) AcquireProvisionerJob(ctx context.Context, arg AcquireProvi
return i, err
}
// getHungProvisionerJobs selects provisioner_jobs rows that have a start time,
// have no completion time, and were last updated before the $1 timestamp.
const getHungProvisionerJobs = `-- name: GetHungProvisionerJobs :many
SELECT
id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata
FROM
provisioner_jobs
WHERE
updated_at < $1
AND started_at IS NOT NULL
AND completed_at IS NULL
`
// GetHungProvisionerJobs runs the getHungProvisionerJobs query with updatedAt
// as the cutoff and scans every returned row into a ProvisionerJob. The
// returned slice is nil when the query yields no rows.
func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) {
rows, err := q.db.QueryContext(ctx, getHungProvisionerJobs, updatedAt)
if err != nil {
return nil, err
}
defer rows.Close()
var items []ProvisionerJob
for rows.Next() {
var i ProvisionerJob
// Scan targets must stay in the same order as the SELECT column list.
if err := rows.Scan(
&i.ID,
&i.CreatedAt,
&i.UpdatedAt,
&i.StartedAt,
&i.CanceledAt,
&i.CompletedAt,
&i.Error,
&i.OrganizationID,
&i.InitiatorID,
&i.Provisioner,
&i.StorageMethod,
&i.Type,
&i.Input,
&i.WorkerID,
&i.FileID,
&i.Tags,
&i.ErrorCode,
&i.TraceMetadata,
); err != nil {
return nil, err
}
items = append(items, i)
}
// Close explicitly so a close error is reported to the caller; the deferred
// Close above covers the early-return paths.
if err := rows.Close(); err != nil {
return nil, err
}
// Report any error encountered while iterating.
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one
SELECT
id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata

View File

@ -3,8 +3,6 @@
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_advisory_xact_lock($1);
-- name: TryAcquireLock :one
@ -12,6 +10,4 @@ SELECT pg_advisory_xact_lock($1);
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_try_advisory_xact_lock($1);

View File

@ -128,3 +128,13 @@ SET
error_code = $5
WHERE
id = $1;
-- name: GetHungProvisionerJobs :many
-- Returns provisioner jobs that have started, have not completed, and were
-- last updated before the given timestamp.
SELECT
*
FROM
provisioner_jobs
WHERE
updated_at < $1
AND started_at IS NOT NULL
AND completed_at IS NULL;