From e0483e313678a4dd44ddeef5d8345d3e9fdf3e77 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 28 Apr 2025 12:28:56 +0200 Subject: [PATCH] feat: add prebuilds metrics collector (#17547) Closes https://github.com/coder/internal/issues/509 --------- Signed-off-by: Danny Kopping --- coderd/prebuilds/api.go | 13 + enterprise/coderd/coderd.go | 2 +- enterprise/coderd/prebuilds/claim_test.go | 5 +- .../coderd/prebuilds/metricscollector.go | 123 +++++++ .../coderd/prebuilds/metricscollector_test.go | 331 ++++++++++++++++++ enterprise/coderd/prebuilds/reconcile.go | 51 ++- enterprise/coderd/prebuilds/reconcile_test.go | 49 ++- 7 files changed, 548 insertions(+), 26 deletions(-) create mode 100644 enterprise/coderd/prebuilds/metricscollector.go create mode 100644 enterprise/coderd/prebuilds/metricscollector_test.go diff --git a/coderd/prebuilds/api.go b/coderd/prebuilds/api.go index ba174d318d..2342da5d62 100644 --- a/coderd/prebuilds/api.go +++ b/coderd/prebuilds/api.go @@ -5,6 +5,8 @@ import ( "github.com/google/uuid" "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" ) var ErrNoClaimablePrebuiltWorkspaces = xerrors.New("no claimable prebuilt workspaces found") @@ -25,12 +27,23 @@ type ReconciliationOrchestrator interface { } type Reconciler interface { + StateSnapshotter + // ReconcileAll orchestrates the reconciliation of all prebuilds across all templates. // It takes a global snapshot of the system state and then reconciles each preset // in parallel, creating or deleting prebuilds as needed to reach their desired states. ReconcileAll(ctx context.Context) error } +// StateSnapshotter defines the operations necessary to capture workspace prebuilds state. +type StateSnapshotter interface { + // SnapshotState captures the current state of all prebuilds across templates. + // It creates a global database snapshot that can be viewed as a collection of PresetSnapshots, + // each representing the state of prebuilds for a specific preset. + // MUST be called inside a repeatable-read transaction. + SnapshotState(ctx context.Context, store database.Store) (*GlobalSnapshot, error) +} + type Claimer interface { Claim(ctx context.Context, userID uuid.UUID, name string, presetID uuid.UUID) (*uuid.UUID, error) Initiator() uuid.UUID diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 1f468997ac..ca3531b60d 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -1165,6 +1165,6 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio } reconciler := prebuilds.NewStoreReconciler(api.Database, api.Pubsub, api.DeploymentValues.Prebuilds, - api.Logger.Named("prebuilds"), quartz.NewReal()) + api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry) return reconciler, prebuilds.EnterpriseClaimer{} } diff --git a/enterprise/coderd/prebuilds/claim_test.go b/enterprise/coderd/prebuilds/claim_test.go index 4f398724b8..1573aab938 100644 --- a/enterprise/coderd/prebuilds/claim_test.go +++ b/enterprise/coderd/prebuilds/claim_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" "golang.org/x/xerrors" @@ -142,7 +143,7 @@ func TestClaimPrebuild(t *testing.T) { EntitlementsUpdateInterval: time.Second, }) - reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t)) + reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(spy) api.AGPL.PrebuildsClaimer.Store(&claimer) @@ -419,7 +420,7 @@ func TestClaimPrebuild_CheckDifferentErrors(t *testing.T) { EntitlementsUpdateInterval: time.Second, }) - reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t)) + reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), api.PrometheusRegistry) var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(errorStore) api.AGPL.PrebuildsClaimer.Store(&claimer) diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go new file mode 100644 index 0000000000..7b55227eff --- /dev/null +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -0,0 +1,123 @@ +package prebuilds + +import ( + "context" + "time" + + "cdr.dev/slog" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/prebuilds" +) + +var ( + labels = []string{"template_name", "preset_name", "organization_name"} + createdPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_created_total", + "Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+ + "template preset.", + labels, + nil, + ) + failedPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_failed_total", + "Total number of prebuilt workspaces that failed to build.", + labels, + nil, + ) + claimedPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_claimed_total", + "Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+ + "with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.", + labels, + nil, + ) + desiredPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_desired", + "Target number of prebuilt workspaces that should be available for each template preset.", + labels, + nil, + ) + runningPrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_running", + "Current number of prebuilt workspaces that are in a running state. These workspaces have started "+ + "successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).", + labels, + nil, + ) + eligiblePrebuildsDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_eligible", + "Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+ + "have completed their build process with their agent reporting 'ready' status.", + labels, + nil, + ) +) + +type MetricsCollector struct { + database database.Store + logger slog.Logger + snapshotter prebuilds.StateSnapshotter +} + +var _ prometheus.Collector = new(MetricsCollector) + +func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { + return &MetricsCollector{ + database: db, + logger: logger.Named("prebuilds_metrics_collector"), + snapshotter: snapshotter, + } +} + +func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { + descCh <- createdPrebuildsDesc + descCh <- failedPrebuildsDesc + descCh <- claimedPrebuildsDesc + descCh <- desiredPrebuildsDesc + descCh <- runningPrebuildsDesc + descCh <- eligiblePrebuildsDesc +} + +func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { + // nolint:gocritic // We need to set an authz context to read metrics from the db. + ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) + defer cancel() + prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) + if err != nil { + mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) + return + } + + for _, metric := range prebuildMetrics { + metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) + } + + snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database) + if err != nil { + mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err)) + return + } + + for _, preset := range snapshot.Presets { + if !preset.UsingActiveVersion { + continue + } + + presetSnapshot, err := snapshot.FilterByPreset(preset.ID) + if err != nil { + mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) + continue + } + state := presetSnapshot.CalculateState() + + metricsCh <- prometheus.MustNewConstMetric(desiredPrebuildsDesc, prometheus.GaugeValue, float64(state.Desired), preset.TemplateName, preset.Name, preset.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) + metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) + } +} diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go new file mode 100644 index 0000000000..859509ced6 --- /dev/null +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -0,0 +1,331 @@ +package prebuilds_test + +import ( + "fmt" + "slices" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + "tailscale.com/types/ptr" + + "github.com/prometheus/client_golang/prometheus" + prometheus_client "github.com/prometheus/client_model/go" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/quartz" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbgen" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" + "github.com/coder/coder/v2/codersdk" + "github.com/coder/coder/v2/enterprise/coderd/prebuilds" + "github.com/coder/coder/v2/testutil" +) + +func TestMetricsCollector(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("this test requires postgres") + } + + type metricCheck struct { + name string + value *float64 + isCounter bool + } + + type testCase struct { + name string + transitions []database.WorkspaceTransition + jobStatuses []database.ProvisionerJobStatus + initiatorIDs []uuid.UUID + ownerIDs []uuid.UUID + metrics []metricCheck + templateDeleted []bool + eligible []bool + } + + tests := []testCase{ + { + name: "prebuild provisioned but not completed", + transitions: allTransitions, + jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling), + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "prebuild running", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "prebuild failed", + transitions: allTransitions, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusFailed}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "prebuild eligible", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(1.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{true}, + }, + { + name: "prebuild ineligible", + transitions: allTransitions, + jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusSucceeded), + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "prebuild claimed", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_claimed_total", ptr.To(1.0), true}, + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "workspaces that were not created by the prebuilds user are not counted", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{uuid.New()}, + ownerIDs: []uuid.UUID{uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_running", ptr.To(0.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{false}, + eligible: []bool{false}, + }, + { + name: "deleted templates never desire prebuilds", + transitions: allTransitions, + jobStatuses: allJobStatuses, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false}, + }, + templateDeleted: []bool{true}, + eligible: []bool{false}, + }, + { + name: "running prebuilds for deleted templates are still counted, so that they can be deleted", + transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID}, + metrics: []metricCheck{ + {"coderd_prebuilt_workspaces_running", ptr.To(1.0), false}, + {"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false}, + }, + templateDeleted: []bool{true}, + eligible: []bool{false}, + }, + } + for _, test := range tests { + test := test // capture for parallel + for _, transition := range test.transitions { + transition := transition // capture for parallel + for _, jobStatus := range test.jobStatuses { + jobStatus := jobStatus // capture for parallel + for _, initiatorID := range test.initiatorIDs { + initiatorID := initiatorID // capture for parallel + for _, ownerID := range test.ownerIDs { + ownerID := ownerID // capture for parallel + for _, templateDeleted := range test.templateDeleted { + templateDeleted := templateDeleted // capture for parallel + for _, eligible := range test.eligible { + eligible := eligible // capture for parallel + t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) { + t.Parallel() + + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + t.Cleanup(func() { + if t.Failed() { + t.Logf("failed to run test: %s", test.name) + t.Logf("transition: %s", transition) + t.Logf("jobStatus: %s", jobStatus) + t.Logf("initiatorID: %s", initiatorID) + t.Logf("ownerID: %s", ownerID) + t.Logf("templateDeleted: %t", templateDeleted) + } + }) + clock := quartz.NewMock(t) + db, pubsub := dbtestutil.NewDB(t) + reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry()) + ctx := testutil.Context(t, testutil.WaitLong) + + createdUsers := []uuid.UUID{agplprebuilds.SystemUserID} + for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) { + if !slices.Contains(createdUsers, user) { + dbgen.User(t, db, database.User{ + ID: user, + }) + createdUsers = append(createdUsers, user) + } + } + + collector := prebuilds.NewMetricsCollector(db, logger, reconciler) + registry := prometheus.NewPedanticRegistry() + registry.Register(collector) + + numTemplates := 2 + for i := 0; i < numTemplates; i++ { + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String()) + workspace := setupTestDBWorkspace( + t, clock, db, pubsub, + transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID, + ) + setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible) + } + + metricsFamilies, err := registry.Gather() + require.NoError(t, err) + + templates, err := db.GetTemplates(ctx) + require.NoError(t, err) + require.Equal(t, numTemplates, len(templates)) + + for _, template := range templates { + org, err := db.GetOrganizationByID(ctx, template.OrganizationID) + require.NoError(t, err) + templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{ + TemplateID: template.ID, + }) + require.NoError(t, err) + require.Equal(t, 1, len(templateVersions)) + + presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID) + require.NoError(t, err) + require.Equal(t, 1, len(presets)) + + for _, preset := range presets { + preset := preset // capture for parallel + labels := map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "organization_name": org.Name, + } + + for _, check := range test.metrics { + metric := findMetric(metricsFamilies, check.name, labels) + if check.value == nil { + continue + } + + require.NotNil(t, metric, "metric %s should exist", check.name) + + if check.isCounter { + require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name) + } else { + require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name) + } + } + } + } + }) + } + } + } + } + } + } + } +} + +func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric { + for _, metricFamily := range metricsFamilies { + if metricFamily.GetName() != name { + continue + } + + for _, metric := range metricFamily.GetMetric() { + labelPairs := metric.GetLabel() + + // Convert label pairs to map for easier lookup + metricLabels := make(map[string]string, len(labelPairs)) + for _, label := range labelPairs { + metricLabels[label.GetName()] = label.GetValue() + } + + // Check if all requested labels match + for wantName, wantValue := range labels { + if metricLabels[wantName] != wantValue { + continue + } + } + + return metric + } + } + return nil +} diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 081b4223a9..134365b657 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -9,6 +9,7 @@ import ( "time" "github.com/hashicorp/go-multierror" + "github.com/prometheus/client_golang/prometheus" "github.com/coder/quartz" @@ -31,11 +32,13 @@ import ( ) type StoreReconciler struct { - store database.Store - cfg codersdk.PrebuildsConfig - pubsub pubsub.Pubsub - logger slog.Logger - clock quartz.Clock + store database.Store + cfg codersdk.PrebuildsConfig + pubsub pubsub.Pubsub + logger slog.Logger + clock quartz.Clock + registerer prometheus.Registerer + metrics *MetricsCollector cancelFn context.CancelCauseFunc running atomic.Bool @@ -45,21 +48,30 @@ type StoreReconciler struct { var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} -func NewStoreReconciler( - store database.Store, +func NewStoreReconciler(store database.Store, ps pubsub.Pubsub, cfg codersdk.PrebuildsConfig, logger slog.Logger, clock quartz.Clock, + registerer prometheus.Registerer, ) *StoreReconciler { - return &StoreReconciler{ - store: store, - pubsub: ps, - logger: logger, - cfg: cfg, - clock: clock, - done: make(chan struct{}, 1), + reconciler := &StoreReconciler{ + store: store, + pubsub: ps, + logger: logger, + cfg: cfg, + clock: clock, + registerer: registerer, + done: make(chan struct{}, 1), } + + reconciler.metrics = NewMetricsCollector(store, logger, reconciler) + if err := registerer.Register(reconciler.metrics); err != nil { + // If the registerer fails to register the metrics collector, it's not fatal. + logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + } + + return reconciler } func (c *StoreReconciler) Run(ctx context.Context) { @@ -128,6 +140,17 @@ func (c *StoreReconciler) Stop(ctx context.Context, cause error) { return } + // Unregister the metrics collector. + if c.metrics != nil && c.registerer != nil { + if !c.registerer.Unregister(c.metrics) { + // The API doesn't allow us to know why the de-registration failed, but it's not very consequential. + // The only time this would be an issue is if the premium license is removed, leading to the feature being + // disabled (and consequently this Stop method being called), and then adding a new license which enables the + // feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler. + c.logger.Warn(context.Background(), "failed to unregister metrics collector") + } + } + // If the reconciler is not running, there's nothing else to do. if !c.running.Load() { return diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 5c1ffe993e..9783b215f1 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -8,6 +8,9 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" + + "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/util/slice" "github.com/google/uuid" @@ -45,7 +48,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) { ReconciliationInterval: serpent.Duration(testutil.WaitLong), } logger := testutil.Logger(t) - controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) // given a template version with no presets org := dbgen.Organization(t, db, database.Organization{}) @@ -90,7 +93,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) { ReconciliationInterval: serpent.Duration(testutil.WaitLong), } logger := testutil.Logger(t) - controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) // given there are presets, but no prebuilds org := dbgen.Organization(t, db, database.Organization{}) @@ -317,7 +320,7 @@ func TestPrebuildReconciliation(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -419,7 +422,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -503,7 +506,7 @@ func TestInvalidPreset(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -575,7 +578,7 @@ func TestRunLoop(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) - reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock) + reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry()) ownerID := uuid.New() dbgen.User(t, db, database.User{ @@ -705,7 +708,7 @@ func TestFailedBuildBackoff(t *testing.T) { t, &slogtest.Options{IgnoreErrors: true}, ).Leveled(slog.LevelDebug) db, ps := dbtestutil.NewDB(t) - reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock) + reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry()) // Given: an active template version with presets and prebuilds configured. const desiredInstances = 2 @@ -820,7 +823,7 @@ func TestReconciliationLock(t *testing.T) { codersdk.PrebuildsConfig{}, slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug), quartz.NewMock(t), - ) + prometheus.NewRegistry()) reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error { lockObtained := mutex.TryLock() // As long as the postgres lock is held, this mutex should always be unlocked when we get here. @@ -1009,6 +1012,30 @@ func setupTestDBWorkspace( return workspace } +// nolint:revive // It's a control flag, but this is a test. +func setupTestDBWorkspaceAgent(t *testing.T, db database.Store, workspaceID uuid.UUID, eligible bool) database.WorkspaceAgent { + build, err := db.GetLatestWorkspaceBuildByWorkspaceID(t.Context(), workspaceID) + require.NoError(t, err) + + res := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build.JobID}) + agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ + ResourceID: res.ID, + }) + + // A prebuilt workspace is considered eligible when its agent is in a "ready" lifecycle state. + // i.e. connected to the control plane and all startup scripts have run. + if eligible { + require.NoError(t, db.UpdateWorkspaceAgentLifecycleStateByID(t.Context(), database.UpdateWorkspaceAgentLifecycleStateByIDParams{ + ID: agent.ID, + LifecycleState: database.WorkspaceAgentLifecycleStateReady, + StartedAt: sql.NullTime{Time: dbtime.Now().Add(-time.Minute), Valid: true}, + ReadyAt: sql.NullTime{Time: dbtime.Now(), Valid: true}, + })) + } + + return agent +} + var allTransitions = []database.WorkspaceTransition{ database.WorkspaceTransitionStart, database.WorkspaceTransitionStop, @@ -1024,4 +1051,8 @@ var allJobStatuses = []database.ProvisionerJobStatus{ database.ProvisionerJobStatusCanceling, } -// TODO (sasswart): test mutual exclusion +func allJobStatusesExcept(except ...database.ProvisionerJobStatus) []database.ProvisionerJobStatus { + return slice.Filter(except, func(status database.ProvisionerJobStatus) bool { + return !slice.Contains(allJobStatuses, status) + }) +}