mirror of https://github.com/coder/coder.git
synced 2025-07-12 00:14:10 +00:00
feat: add prebuilds metrics collector (#17547)
Closes https://github.com/coder/internal/issues/509

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
@@ -5,6 +5,8 @@ import (

     "github.com/google/uuid"
     "golang.org/x/xerrors"
+
+    "github.com/coder/coder/v2/coderd/database"
 )

 var ErrNoClaimablePrebuiltWorkspaces = xerrors.New("no claimable prebuilt workspaces found")
@@ -25,12 +27,23 @@ type ReconciliationOrchestrator interface {
 }

 type Reconciler interface {
+    StateSnapshotter
+
     // ReconcileAll orchestrates the reconciliation of all prebuilds across all templates.
     // It takes a global snapshot of the system state and then reconciles each preset
     // in parallel, creating or deleting prebuilds as needed to reach their desired states.
     ReconcileAll(ctx context.Context) error
 }

+// StateSnapshotter defines the operations necessary to capture workspace prebuilds state.
+type StateSnapshotter interface {
+    // SnapshotState captures the current state of all prebuilds across templates.
+    // It creates a global database snapshot that can be viewed as a collection of PresetSnapshots,
+    // each representing the state of prebuilds for a specific preset.
+    // MUST be called inside a repeatable-read transaction.
+    SnapshotState(ctx context.Context, store database.Store) (*GlobalSnapshot, error)
+}
+
 type Claimer interface {
     Claim(ctx context.Context, userID uuid.UUID, name string, presetID uuid.UUID) (*uuid.UUID, error)
     Initiator() uuid.UUID
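The new StateSnapshotter doc comment requires SnapshotState to run inside a repeatable-read transaction. A minimal sketch of how a caller might satisfy that requirement (not part of this commit; it assumes the store's InTx helper and TxOptions isolation field behave as they do in coderd's database package):

// snapshot_sketch.go -- illustration only, not part of this commit.
package prebuilds

import (
    "context"
    "database/sql"

    "github.com/coder/coder/v2/coderd/database"
)

// snapshotInTx wraps SnapshotState in a repeatable-read transaction, as the
// interface comment requires. The InTx/TxOptions usage here is an assumption
// about coderd's database package, not something introduced by this diff.
func snapshotInTx(ctx context.Context, store database.Store, s StateSnapshotter) (*GlobalSnapshot, error) {
    var snapshot *GlobalSnapshot
    err := store.InTx(func(tx database.Store) error {
        var innerErr error
        snapshot, innerErr = s.SnapshotState(ctx, tx)
        return innerErr
    }, &database.TxOptions{Isolation: sql.LevelRepeatableRead})
    return snapshot, err
}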
@@ -1165,6 +1165,6 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio
     }

     reconciler := prebuilds.NewStoreReconciler(api.Database, api.Pubsub, api.DeploymentValues.Prebuilds,
-        api.Logger.Named("prebuilds"), quartz.NewReal())
+        api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry)
     return reconciler, prebuilds.EnterpriseClaimer{}
 }
@@ -10,6 +10,7 @@ import (
     "time"

     "github.com/google/uuid"
+    "github.com/prometheus/client_golang/prometheus"
     "github.com/stretchr/testify/require"
     "golang.org/x/xerrors"

@@ -142,7 +143,7 @@ func TestClaimPrebuild(t *testing.T) {
         EntitlementsUpdateInterval: time.Second,
     })

-    reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t))
+    reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
     var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(spy)
     api.AGPL.PrebuildsClaimer.Store(&claimer)

@@ -419,7 +420,7 @@ func TestClaimPrebuild_CheckDifferentErrors(t *testing.T) {
         EntitlementsUpdateInterval: time.Second,
     })

-    reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t))
+    reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), api.PrometheusRegistry)
     var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(errorStore)
     api.AGPL.PrebuildsClaimer.Store(&claimer)

enterprise/coderd/prebuilds/metricscollector.go (new file, 123 lines)
@@ -0,0 +1,123 @@
package prebuilds

import (
    "context"
    "time"

    "cdr.dev/slog"

    "github.com/prometheus/client_golang/prometheus"

    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/database/dbauthz"
    "github.com/coder/coder/v2/coderd/prebuilds"
)

var (
    labels               = []string{"template_name", "preset_name", "organization_name"}
    createdPrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_created_total",
        "Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
            "template preset.",
        labels,
        nil,
    )
    failedPrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_failed_total",
        "Total number of prebuilt workspaces that failed to build.",
        labels,
        nil,
    )
    claimedPrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_claimed_total",
        "Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
            "with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
        labels,
        nil,
    )
    desiredPrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_desired",
        "Target number of prebuilt workspaces that should be available for each template preset.",
        labels,
        nil,
    )
    runningPrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_running",
        "Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
            "successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
        labels,
        nil,
    )
    eligiblePrebuildsDesc = prometheus.NewDesc(
        "coderd_prebuilt_workspaces_eligible",
        "Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
            "have completed their build process with their agent reporting 'ready' status.",
        labels,
        nil,
    )
)

type MetricsCollector struct {
    database    database.Store
    logger      slog.Logger
    snapshotter prebuilds.StateSnapshotter
}

var _ prometheus.Collector = new(MetricsCollector)

func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
    return &MetricsCollector{
        database:    db,
        logger:      logger.Named("prebuilds_metrics_collector"),
        snapshotter: snapshotter,
    }
}

func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
    descCh <- createdPrebuildsDesc
    descCh <- failedPrebuildsDesc
    descCh <- claimedPrebuildsDesc
    descCh <- desiredPrebuildsDesc
    descCh <- runningPrebuildsDesc
    descCh <- eligiblePrebuildsDesc
}

func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
    // nolint:gocritic // We need to set an authz context to read metrics from the db.
    ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
    defer cancel()
    prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
    if err != nil {
        mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
        return
    }

    for _, metric := range prebuildMetrics {
        metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
        metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
        metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
    }

    snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
    if err != nil {
        mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
        return
    }

    for _, preset := range snapshot.Presets {
        if !preset.UsingActiveVersion {
            continue
        }

        presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
        if err != nil {
            mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
            continue
        }
        state := presetSnapshot.CalculateState()

        metricsCh <- prometheus.MustNewConstMetric(desiredPrebuildsDesc, prometheus.GaugeValue, float64(state.Desired), preset.TemplateName, preset.Name, preset.OrganizationName)
        metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
        metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
    }
}
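For context, a minimal usage sketch (not part of this commit) showing how this collector could be registered on its own registry and exposed for scraping; the db, logger, and snapshotter wiring is assumed to exist in the caller:

// metrics_usage_sketch.go -- illustration only, not part of this commit.
package prebuilds

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"

    "cdr.dev/slog"

    "github.com/coder/coder/v2/coderd/database"
    agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
)

// serveMetrics registers the collector on a dedicated registry and returns a
// scrape handler. In coderd itself, registration happens via api.PrometheusRegistry.
func serveMetrics(db database.Store, logger slog.Logger, snapshotter agplprebuilds.StateSnapshotter) http.Handler {
    registry := prometheus.NewRegistry()
    // Register can fail (e.g. duplicate descriptors); coderd logs this rather than treating it as fatal.
    _ = registry.Register(NewMetricsCollector(db, logger, snapshotter))
    return promhttp.HandlerFor(registry, promhttp.HandlerOpts{})
}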
enterprise/coderd/prebuilds/metricscollector_test.go (new file, 331 lines)
@@ -0,0 +1,331 @@
package prebuilds_test

import (
    "fmt"
    "slices"
    "testing"

    "github.com/google/uuid"
    "github.com/stretchr/testify/require"
    "tailscale.com/types/ptr"

    "github.com/prometheus/client_golang/prometheus"
    prometheus_client "github.com/prometheus/client_model/go"

    "cdr.dev/slog/sloggers/slogtest"
    "github.com/coder/quartz"

    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/database/dbgen"
    "github.com/coder/coder/v2/coderd/database/dbtestutil"
    agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
    "github.com/coder/coder/v2/codersdk"
    "github.com/coder/coder/v2/enterprise/coderd/prebuilds"
    "github.com/coder/coder/v2/testutil"
)

func TestMetricsCollector(t *testing.T) {
    t.Parallel()

    if !dbtestutil.WillUsePostgres() {
        t.Skip("this test requires postgres")
    }

    type metricCheck struct {
        name      string
        value     *float64
        isCounter bool
    }

    type testCase struct {
        name            string
        transitions     []database.WorkspaceTransition
        jobStatuses     []database.ProvisionerJobStatus
        initiatorIDs    []uuid.UUID
        ownerIDs        []uuid.UUID
        metrics         []metricCheck
        templateDeleted []bool
        eligible        []bool
    }

    tests := []testCase{
        {
            name:         "prebuild provisioned but not completed",
            transitions:  allTransitions,
            jobStatuses:  allJobStatusesExcept(database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling),
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "prebuild running",
            transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
            jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "prebuild failed",
            transitions:  allTransitions,
            jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusFailed},
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_failed_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "prebuild eligible",
            transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
            jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(1.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{true},
        },
        {
            name:         "prebuild ineligible",
            transitions:  allTransitions,
            jobStatuses:  allJobStatusesExcept(database.ProvisionerJobStatusSucceeded),
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "prebuild claimed",
            transitions:  allTransitions,
            jobStatuses:  allJobStatuses,
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{uuid.New()},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_claimed_total", ptr.To(1.0), true},
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "workspaces that were not created by the prebuilds user are not counted",
            transitions:  allTransitions,
            jobStatuses:  allJobStatuses,
            initiatorIDs: []uuid.UUID{uuid.New()},
            ownerIDs:     []uuid.UUID{uuid.New()},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{false},
            eligible:        []bool{false},
        },
        {
            name:         "deleted templates never desire prebuilds",
            transitions:  allTransitions,
            jobStatuses:  allJobStatuses,
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false},
            },
            templateDeleted: []bool{true},
            eligible:        []bool{false},
        },
        {
            name:         "running prebuilds for deleted templates are still counted, so that they can be deleted",
            transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
            jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
            initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
            ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
            metrics: []metricCheck{
                {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
                {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
            },
            templateDeleted: []bool{true},
            eligible:        []bool{false},
        },
    }

    for _, test := range tests {
        test := test // capture for parallel
        for _, transition := range test.transitions {
            transition := transition // capture for parallel
            for _, jobStatus := range test.jobStatuses {
                jobStatus := jobStatus // capture for parallel
                for _, initiatorID := range test.initiatorIDs {
                    initiatorID := initiatorID // capture for parallel
                    for _, ownerID := range test.ownerIDs {
                        ownerID := ownerID // capture for parallel
                        for _, templateDeleted := range test.templateDeleted {
                            templateDeleted := templateDeleted // capture for parallel
                            for _, eligible := range test.eligible {
                                eligible := eligible // capture for parallel
                                t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) {
                                    t.Parallel()

                                    logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
                                    t.Cleanup(func() {
                                        if t.Failed() {
                                            t.Logf("failed to run test: %s", test.name)
                                            t.Logf("transition: %s", transition)
                                            t.Logf("jobStatus: %s", jobStatus)
                                            t.Logf("initiatorID: %s", initiatorID)
                                            t.Logf("ownerID: %s", ownerID)
                                            t.Logf("templateDeleted: %t", templateDeleted)
                                        }
                                    })
                                    clock := quartz.NewMock(t)
                                    db, pubsub := dbtestutil.NewDB(t)
                                    reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
                                    ctx := testutil.Context(t, testutil.WaitLong)

                                    createdUsers := []uuid.UUID{agplprebuilds.SystemUserID}
                                    for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) {
                                        if !slices.Contains(createdUsers, user) {
                                            dbgen.User(t, db, database.User{
                                                ID: user,
                                            })
                                            createdUsers = append(createdUsers, user)
                                        }
                                    }

                                    collector := prebuilds.NewMetricsCollector(db, logger, reconciler)
                                    registry := prometheus.NewPedanticRegistry()
                                    registry.Register(collector)

                                    numTemplates := 2
                                    for i := 0; i < numTemplates; i++ {
                                        org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
                                        templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID)
                                        preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
                                        workspace := setupTestDBWorkspace(
                                            t, clock, db, pubsub,
                                            transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID,
                                        )
                                        setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
                                    }

                                    metricsFamilies, err := registry.Gather()
                                    require.NoError(t, err)

                                    templates, err := db.GetTemplates(ctx)
                                    require.NoError(t, err)
                                    require.Equal(t, numTemplates, len(templates))

                                    for _, template := range templates {
                                        org, err := db.GetOrganizationByID(ctx, template.OrganizationID)
                                        require.NoError(t, err)
                                        templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{
                                            TemplateID: template.ID,
                                        })
                                        require.NoError(t, err)
                                        require.Equal(t, 1, len(templateVersions))

                                        presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID)
                                        require.NoError(t, err)
                                        require.Equal(t, 1, len(presets))

                                        for _, preset := range presets {
                                            preset := preset // capture for parallel
                                            labels := map[string]string{
                                                "template_name":     template.Name,
                                                "preset_name":       preset.Name,
                                                "organization_name": org.Name,
                                            }

                                            for _, check := range test.metrics {
                                                metric := findMetric(metricsFamilies, check.name, labels)
                                                if check.value == nil {
                                                    continue
                                                }

                                                require.NotNil(t, metric, "metric %s should exist", check.name)

                                                if check.isCounter {
                                                    require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name)
                                                } else {
                                                    require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name)
                                                }
                                            }
                                        }
                                    }
                                })
                            }
                        }
                    }
                }
            }
        }
    }
}

func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric {
    for _, metricFamily := range metricsFamilies {
        if metricFamily.GetName() != name {
            continue
        }

        for _, metric := range metricFamily.GetMetric() {
            labelPairs := metric.GetLabel()

            // Convert label pairs to map for easier lookup
            metricLabels := make(map[string]string, len(labelPairs))
            for _, label := range labelPairs {
                metricLabels[label.GetName()] = label.GetValue()
            }

            // Check if all requested labels match
            for wantName, wantValue := range labels {
                if metricLabels[wantName] != wantValue {
                    continue
                }
            }

            return metric
        }
    }
    return nil
}
@@ -9,6 +9,7 @@ import (
     "time"

     "github.com/hashicorp/go-multierror"
+    "github.com/prometheus/client_golang/prometheus"

     "github.com/coder/quartz"

@@ -31,11 +32,13 @@ import (
 )

 type StoreReconciler struct {
     store      database.Store
     cfg        codersdk.PrebuildsConfig
     pubsub     pubsub.Pubsub
     logger     slog.Logger
     clock      quartz.Clock
+    registerer prometheus.Registerer
+    metrics    *MetricsCollector

     cancelFn context.CancelCauseFunc
     running  atomic.Bool
@@ -45,21 +48,30 @@ type StoreReconciler struct {

 var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}

-func NewStoreReconciler(
-    store database.Store,
+func NewStoreReconciler(store database.Store,
     ps pubsub.Pubsub,
     cfg codersdk.PrebuildsConfig,
     logger slog.Logger,
     clock quartz.Clock,
+    registerer prometheus.Registerer,
 ) *StoreReconciler {
-    return &StoreReconciler{
+    reconciler := &StoreReconciler{
         store:      store,
         pubsub:     ps,
         logger:     logger,
         cfg:        cfg,
         clock:      clock,
-        done:       make(chan struct{}, 1),
+        registerer: registerer,
+        done:       make(chan struct{}, 1),
     }
+
+    reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
+    if err := registerer.Register(reconciler.metrics); err != nil {
+        // If the registerer fails to register the metrics collector, it's not fatal.
+        logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
+    }
+
+    return reconciler
 }

 func (c *StoreReconciler) Run(ctx context.Context) {
@@ -128,6 +140,17 @@ func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
         return
     }

+    // Unregister the metrics collector.
+    if c.metrics != nil && c.registerer != nil {
+        if !c.registerer.Unregister(c.metrics) {
+            // The API doesn't allow us to know why the de-registration failed, but it's not very consequential.
+            // The only time this would be an issue is if the premium license is removed, leading to the feature being
+            // disabled (and consequently this Stop method being called), and then adding a new license which enables the
+            // feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler.
+            c.logger.Warn(context.Background(), "failed to unregister metrics collector")
+        }
+    }
+
     // If the reconciler is not running, there's nothing else to do.
     if !c.running.Load() {
         return
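A small sketch (not part of this commit) of the prometheus.Registerer contract the code above relies on: Register returns an error describing a failure, while Unregister only reports success or failure, which is why the warning above cannot include a cause.

// registerer_sketch.go -- illustration only, not part of this commit.
package prebuilds

import (
    "log"

    "github.com/prometheus/client_golang/prometheus"
)

// registererContract demonstrates the asymmetry between Register and Unregister.
func registererContract(reg prometheus.Registerer, collector prometheus.Collector) {
    if err := reg.Register(collector); err != nil {
        // For example, prometheus.AlreadyRegisteredError when descriptors collide.
        log.Printf("register failed: %v", err)
    }
    if !reg.Unregister(collector) {
        // No cause is available from the API, only a boolean.
        log.Printf("unregister failed")
    }
}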
@@ -8,6 +8,9 @@ import (
     "testing"
     "time"

+    "github.com/prometheus/client_golang/prometheus"
+
+    "github.com/coder/coder/v2/coderd/database/dbtime"
     "github.com/coder/coder/v2/coderd/util/slice"

     "github.com/google/uuid"
@@ -45,7 +48,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) {
         ReconciliationInterval: serpent.Duration(testutil.WaitLong),
     }
     logger := testutil.Logger(t)
-    controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t))
+    controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())

     // given a template version with no presets
     org := dbgen.Organization(t, db, database.Organization{})
@@ -90,7 +93,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) {
         ReconciliationInterval: serpent.Duration(testutil.WaitLong),
     }
     logger := testutil.Logger(t)
-    controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t))
+    controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())

     // given there are presets, but no prebuilds
     org := dbgen.Organization(t, db, database.Organization{})
@@ -317,7 +320,7 @@ func TestPrebuildReconciliation(t *testing.T) {
         t, &slogtest.Options{IgnoreErrors: true},
     ).Leveled(slog.LevelDebug)
     db, pubSub := dbtestutil.NewDB(t)
-    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())

     ownerID := uuid.New()
     dbgen.User(t, db, database.User{
@@ -419,7 +422,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
         t, &slogtest.Options{IgnoreErrors: true},
     ).Leveled(slog.LevelDebug)
     db, pubSub := dbtestutil.NewDB(t)
-    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())

     ownerID := uuid.New()
     dbgen.User(t, db, database.User{
@@ -503,7 +506,7 @@ func TestInvalidPreset(t *testing.T) {
         t, &slogtest.Options{IgnoreErrors: true},
     ).Leveled(slog.LevelDebug)
     db, pubSub := dbtestutil.NewDB(t)
-    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+    controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())

     ownerID := uuid.New()
     dbgen.User(t, db, database.User{
@@ -575,7 +578,7 @@ func TestRunLoop(t *testing.T) {
         t, &slogtest.Options{IgnoreErrors: true},
     ).Leveled(slog.LevelDebug)
     db, pubSub := dbtestutil.NewDB(t)
-    reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock)
+    reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry())

     ownerID := uuid.New()
     dbgen.User(t, db, database.User{
@@ -705,7 +708,7 @@ func TestFailedBuildBackoff(t *testing.T) {
         t, &slogtest.Options{IgnoreErrors: true},
     ).Leveled(slog.LevelDebug)
     db, ps := dbtestutil.NewDB(t)
-    reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock)
+    reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry())

     // Given: an active template version with presets and prebuilds configured.
     const desiredInstances = 2
@@ -820,7 +823,7 @@ func TestReconciliationLock(t *testing.T) {
         codersdk.PrebuildsConfig{},
         slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug),
         quartz.NewMock(t),
-    )
+        prometheus.NewRegistry())
     reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error {
         lockObtained := mutex.TryLock()
         // As long as the postgres lock is held, this mutex should always be unlocked when we get here.
@@ -1009,6 +1012,30 @@ func setupTestDBWorkspace(
     return workspace
 }

+// nolint:revive // It's a control flag, but this is a test.
+func setupTestDBWorkspaceAgent(t *testing.T, db database.Store, workspaceID uuid.UUID, eligible bool) database.WorkspaceAgent {
+    build, err := db.GetLatestWorkspaceBuildByWorkspaceID(t.Context(), workspaceID)
+    require.NoError(t, err)
+
+    res := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build.JobID})
+    agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
+        ResourceID: res.ID,
+    })
+
+    // A prebuilt workspace is considered eligible when its agent is in a "ready" lifecycle state.
+    // i.e. connected to the control plane and all startup scripts have run.
+    if eligible {
+        require.NoError(t, db.UpdateWorkspaceAgentLifecycleStateByID(t.Context(), database.UpdateWorkspaceAgentLifecycleStateByIDParams{
+            ID:             agent.ID,
+            LifecycleState: database.WorkspaceAgentLifecycleStateReady,
+            StartedAt:      sql.NullTime{Time: dbtime.Now().Add(-time.Minute), Valid: true},
+            ReadyAt:        sql.NullTime{Time: dbtime.Now(), Valid: true},
+        }))
+    }
+
+    return agent
+}
+
 var allTransitions = []database.WorkspaceTransition{
     database.WorkspaceTransitionStart,
     database.WorkspaceTransitionStop,
@@ -1024,4 +1051,8 @@ var allJobStatuses = []database.ProvisionerJobStatus{
     database.ProvisionerJobStatusCanceling,
 }

-// TODO (sasswart): test mutual exclusion
+func allJobStatusesExcept(except ...database.ProvisionerJobStatus) []database.ProvisionerJobStatus {
+    return slice.Filter(except, func(status database.ProvisionerJobStatus) bool {
+        return !slice.Contains(allJobStatuses, status)
+    })
+}