feat: add prebuilds metrics collector (#17547)

Closes https://github.com/coder/internal/issues/509

---------

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
Author: Danny Kopping
Date: 2025-04-28 12:28:56 +02:00
Committed by: GitHub
Parent: b47d54d777
Commit: e0483e3136
7 changed files with 548 additions and 26 deletions


@@ -5,6 +5,8 @@ import (
 	"github.com/google/uuid"
 	"golang.org/x/xerrors"
+
+	"github.com/coder/coder/v2/coderd/database"
 )
 
 var ErrNoClaimablePrebuiltWorkspaces = xerrors.New("no claimable prebuilt workspaces found")
@@ -25,12 +27,23 @@ type ReconciliationOrchestrator interface {
 }
 
 type Reconciler interface {
+	StateSnapshotter
+
 	// ReconcileAll orchestrates the reconciliation of all prebuilds across all templates.
 	// It takes a global snapshot of the system state and then reconciles each preset
 	// in parallel, creating or deleting prebuilds as needed to reach their desired states.
 	ReconcileAll(ctx context.Context) error
 }
 
+// StateSnapshotter defines the operations necessary to capture workspace prebuilds state.
+type StateSnapshotter interface {
+	// SnapshotState captures the current state of all prebuilds across templates.
+	// It creates a global database snapshot that can be viewed as a collection of PresetSnapshots,
+	// each representing the state of prebuilds for a specific preset.
+	// MUST be called inside a repeatable-read transaction.
+	SnapshotState(ctx context.Context, store database.Store) (*GlobalSnapshot, error)
+}
+
 type Claimer interface {
 	Claim(ctx context.Context, userID uuid.UUID, name string, presetID uuid.UUID) (*uuid.UUID, error)
 	Initiator() uuid.UUID
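The SnapshotState contract above is easy to get wrong, so here is a minimal sketch of a caller honoring the repeatable-read requirement. It assumes the store's InTx helper and the database.TxOptions shape from the surrounding codebase; the snapshotInTx helper name is illustrative and not part of this diff.

package example

import (
	"context"
	"database/sql"

	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/prebuilds"
)

// snapshotInTx captures a prebuilds snapshot inside a repeatable-read,
// read-only transaction, as required by StateSnapshotter.SnapshotState.
// The exact TxOptions fields are assumptions, not verified against this diff.
func snapshotInTx(ctx context.Context, store database.Store, s prebuilds.StateSnapshotter) (*prebuilds.GlobalSnapshot, error) {
	var snapshot *prebuilds.GlobalSnapshot
	err := store.InTx(func(tx database.Store) error {
		var err error
		snapshot, err = s.SnapshotState(ctx, tx)
		return err
	}, &database.TxOptions{
		Isolation: sql.LevelRepeatableRead,
		ReadOnly:  true,
	})
	return snapshot, err
}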


@@ -1165,6 +1165,6 @@ func (api *API) setupPrebuilds(featureEnabled bool) (agplprebuilds.Reconciliatio
 	}
 
 	reconciler := prebuilds.NewStoreReconciler(api.Database, api.Pubsub, api.DeploymentValues.Prebuilds,
-		api.Logger.Named("prebuilds"), quartz.NewReal())
+		api.Logger.Named("prebuilds"), quartz.NewReal(), api.PrometheusRegistry)
 	return reconciler, prebuilds.EnterpriseClaimer{}
 }


@@ -10,6 +10,7 @@ import (
 	"time"
 
 	"github.com/google/uuid"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/xerrors"
@@ -142,7 +143,7 @@ func TestClaimPrebuild(t *testing.T) {
 		EntitlementsUpdateInterval: time.Second,
 	})
-	reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t))
+	reconciler := prebuilds.NewStoreReconciler(spy, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 	var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(spy)
 	api.AGPL.PrebuildsClaimer.Store(&claimer)
@@ -419,7 +420,7 @@ func TestClaimPrebuild_CheckDifferentErrors(t *testing.T) {
 		EntitlementsUpdateInterval: time.Second,
 	})
-	reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t))
+	reconciler := prebuilds.NewStoreReconciler(errorStore, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), api.PrometheusRegistry)
 
 	var claimer agplprebuilds.Claimer = prebuilds.NewEnterpriseClaimer(errorStore)
 	api.AGPL.PrebuildsClaimer.Store(&claimer)


@@ -0,0 +1,123 @@
package prebuilds

import (
	"context"
	"time"

	"cdr.dev/slog"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/database/dbauthz"
	"github.com/coder/coder/v2/coderd/prebuilds"
)

var (
	labels               = []string{"template_name", "preset_name", "organization_name"}
	createdPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_created_total",
		"Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
			"template preset.",
		labels,
		nil,
	)
	failedPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_failed_total",
		"Total number of prebuilt workspaces that failed to build.",
		labels,
		nil,
	)
	claimedPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_claimed_total",
		"Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
			"with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
		labels,
		nil,
	)
	desiredPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_desired",
		"Target number of prebuilt workspaces that should be available for each template preset.",
		labels,
		nil,
	)
	runningPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_running",
		"Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
			"successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
		labels,
		nil,
	)
	eligiblePrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_eligible",
		"Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
			"have completed their build process with their agent reporting 'ready' status.",
		labels,
		nil,
	)
)

type MetricsCollector struct {
	database    database.Store
	logger      slog.Logger
	snapshotter prebuilds.StateSnapshotter
}

var _ prometheus.Collector = new(MetricsCollector)

func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
	return &MetricsCollector{
		database:    db,
		logger:      logger.Named("prebuilds_metrics_collector"),
		snapshotter: snapshotter,
	}
}

func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
	descCh <- createdPrebuildsDesc
	descCh <- failedPrebuildsDesc
	descCh <- claimedPrebuildsDesc
	descCh <- desiredPrebuildsDesc
	descCh <- runningPrebuildsDesc
	descCh <- eligiblePrebuildsDesc
}

func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
	// nolint:gocritic // We need to set an authz context to read metrics from the db.
	ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
	defer cancel()
	prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
	if err != nil {
		mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
		return
	}

	for _, metric := range prebuildMetrics {
		metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
		metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
		metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
	}

	snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
	if err != nil {
		mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
		return
	}

	for _, preset := range snapshot.Presets {
		if !preset.UsingActiveVersion {
			continue
		}

		presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
		if err != nil {
			mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
			continue
		}
		state := presetSnapshot.CalculateState()

		metricsCh <- prometheus.MustNewConstMetric(desiredPrebuildsDesc, prometheus.GaugeValue, float64(state.Desired), preset.TemplateName, preset.Name, preset.OrganizationName)
		metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
		metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
	}
}
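For orientation only, the collector above implements the standard prometheus.Collector interface, so wiring it into a scrape endpoint looks roughly like the sketch below. The serveMetrics helper, the listen address, and the db/logger/snapshotter values are assumptions for illustration, not part of this commit; in coderd the collector is registered by the reconciler instead.

package example

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"cdr.dev/slog"

	"github.com/coder/coder/v2/coderd/database"
	agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
	"github.com/coder/coder/v2/enterprise/coderd/prebuilds"
)

// serveMetrics registers the prebuilds collector and exposes it for scraping.
// Each scrape of /metrics invokes Collect, which queries the database.
func serveMetrics(db database.Store, logger slog.Logger, snapshotter agplprebuilds.StateSnapshotter) error {
	registry := prometheus.NewRegistry()
	if err := registry.Register(prebuilds.NewMetricsCollector(db, logger, snapshotter)); err != nil {
		return err
	}
	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	return http.ListenAndServe(":2112", nil)
}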


@@ -0,0 +1,331 @@
package prebuilds_test
import (
"fmt"
"slices"
"testing"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"tailscale.com/types/ptr"
"github.com/prometheus/client_golang/prometheus"
prometheus_client "github.com/prometheus/client_model/go"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/enterprise/coderd/prebuilds"
"github.com/coder/coder/v2/testutil"
)
func TestMetricsCollector(t *testing.T) {
t.Parallel()
if !dbtestutil.WillUsePostgres() {
t.Skip("this test requires postgres")
}
type metricCheck struct {
name string
value *float64
isCounter bool
}
type testCase struct {
name string
transitions []database.WorkspaceTransition
jobStatuses []database.ProvisionerJobStatus
initiatorIDs []uuid.UUID
ownerIDs []uuid.UUID
metrics []metricCheck
templateDeleted []bool
eligible []bool
}
tests := []testCase{
{
name: "prebuild provisioned but not completed",
transitions: allTransitions,
jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling),
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "prebuild running",
transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "prebuild failed",
transitions: allTransitions,
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusFailed},
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "prebuild eligible",
transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(1.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{true},
},
{
name: "prebuild ineligible",
transitions: allTransitions,
jobStatuses: allJobStatusesExcept(database.ProvisionerJobStatusSucceeded),
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_failed_total", ptr.To(0.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "prebuild claimed",
transitions: allTransitions,
jobStatuses: allJobStatuses,
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_created_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_claimed_total", ptr.To(1.0), true},
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "workspaces that were not created by the prebuilds user are not counted",
transitions: allTransitions,
jobStatuses: allJobStatuses,
initiatorIDs: []uuid.UUID{uuid.New()},
ownerIDs: []uuid.UUID{uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_desired", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_running", ptr.To(0.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{false},
eligible: []bool{false},
},
{
name: "deleted templates never desire prebuilds",
transitions: allTransitions,
jobStatuses: allJobStatuses,
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_desired", ptr.To(0.0), false},
},
templateDeleted: []bool{true},
eligible: []bool{false},
},
{
name: "running prebuilds for deleted templates are still counted, so that they can be deleted",
transitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart},
jobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
ownerIDs: []uuid.UUID{agplprebuilds.SystemUserID},
metrics: []metricCheck{
{"coderd_prebuilt_workspaces_running", ptr.To(1.0), false},
{"coderd_prebuilt_workspaces_eligible", ptr.To(0.0), false},
},
templateDeleted: []bool{true},
eligible: []bool{false},
},
}
for _, test := range tests {
test := test // capture for parallel
for _, transition := range test.transitions {
transition := transition // capture for parallel
for _, jobStatus := range test.jobStatuses {
jobStatus := jobStatus // capture for parallel
for _, initiatorID := range test.initiatorIDs {
initiatorID := initiatorID // capture for parallel
for _, ownerID := range test.ownerIDs {
ownerID := ownerID // capture for parallel
for _, templateDeleted := range test.templateDeleted {
templateDeleted := templateDeleted // capture for parallel
for _, eligible := range test.eligible {
eligible := eligible // capture for parallel
t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) {
t.Parallel()
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
t.Cleanup(func() {
if t.Failed() {
t.Logf("failed to run test: %s", test.name)
t.Logf("transition: %s", transition)
t.Logf("jobStatus: %s", jobStatus)
t.Logf("initiatorID: %s", initiatorID)
t.Logf("ownerID: %s", ownerID)
t.Logf("templateDeleted: %t", templateDeleted)
}
})
clock := quartz.NewMock(t)
db, pubsub := dbtestutil.NewDB(t)
reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry())
ctx := testutil.Context(t, testutil.WaitLong)
createdUsers := []uuid.UUID{agplprebuilds.SystemUserID}
for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) {
if !slices.Contains(createdUsers, user) {
dbgen.User(t, db, database.User{
ID: user,
})
createdUsers = append(createdUsers, user)
}
}
collector := prebuilds.NewMetricsCollector(db, logger, reconciler)
registry := prometheus.NewPedanticRegistry()
registry.Register(collector)
numTemplates := 2
for i := 0; i < numTemplates; i++ {
org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID)
preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
workspace := setupTestDBWorkspace(
t, clock, db, pubsub,
transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID,
)
setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
}
metricsFamilies, err := registry.Gather()
require.NoError(t, err)
templates, err := db.GetTemplates(ctx)
require.NoError(t, err)
require.Equal(t, numTemplates, len(templates))
for _, template := range templates {
org, err := db.GetOrganizationByID(ctx, template.OrganizationID)
require.NoError(t, err)
templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{
TemplateID: template.ID,
})
require.NoError(t, err)
require.Equal(t, 1, len(templateVersions))
presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID)
require.NoError(t, err)
require.Equal(t, 1, len(presets))
for _, preset := range presets {
preset := preset // capture for parallel
labels := map[string]string{
"template_name": template.Name,
"preset_name": preset.Name,
"organization_name": org.Name,
}
for _, check := range test.metrics {
metric := findMetric(metricsFamilies, check.name, labels)
if check.value == nil {
continue
}
require.NotNil(t, metric, "metric %s should exist", check.name)
if check.isCounter {
require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name)
} else {
require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name)
}
}
}
}
})
}
}
}
}
}
}
}
}
func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric {
	for _, metricFamily := range metricsFamilies {
		if metricFamily.GetName() != name {
			continue
		}

		for _, metric := range metricFamily.GetMetric() {
			labelPairs := metric.GetLabel()

			// Convert label pairs to a map for easier lookup.
			metricLabels := make(map[string]string, len(labelPairs))
			for _, label := range labelPairs {
				metricLabels[label.GetName()] = label.GetValue()
			}

			// Only return this metric if all requested labels match;
			// otherwise keep scanning the rest of the family.
			matches := true
			for wantName, wantValue := range labels {
				if metricLabels[wantName] != wantValue {
					matches = false
					break
				}
			}
			if matches {
				return metric
			}
		}
	}
	return nil
}
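As a rough alternative assertion style (not used by this commit), client_golang's testutil helper can compare a collector's output against expected exposition text for a single known label set. The label values below are illustrative placeholders and would only match a database seeded with exactly that template, preset, and organization.

package prebuilds_test

import (
	"strings"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	promtestutil "github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

// assertDesiredGauge is a sketch of asserting one expected series; the label
// values are hypothetical and must match the seeded test data to pass.
func assertDesiredGauge(t *testing.T, collector prometheus.Collector) {
	t.Helper()
	expected := strings.NewReader(`
# HELP coderd_prebuilt_workspaces_desired Target number of prebuilt workspaces that should be available for each template preset.
# TYPE coderd_prebuilt_workspaces_desired gauge
coderd_prebuilt_workspaces_desired{organization_name="coder",preset_name="base",template_name="docker"} 1
`)
	require.NoError(t, promtestutil.CollectAndCompare(collector, expected, "coderd_prebuilt_workspaces_desired"))
}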


@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/go-multierror"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/coder/quartz"
@@ -31,11 +32,13 @@ import (
 )
 
 type StoreReconciler struct {
 	store      database.Store
 	cfg        codersdk.PrebuildsConfig
 	pubsub     pubsub.Pubsub
 	logger     slog.Logger
 	clock      quartz.Clock
+	registerer prometheus.Registerer
+	metrics    *MetricsCollector
 
 	cancelFn context.CancelCauseFunc
 	running  atomic.Bool
@@ -45,21 +48,30 @@ type StoreReconciler struct {
 var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{}
 
-func NewStoreReconciler(
-	store database.Store,
+func NewStoreReconciler(store database.Store,
 	ps pubsub.Pubsub,
 	cfg codersdk.PrebuildsConfig,
 	logger slog.Logger,
 	clock quartz.Clock,
+	registerer prometheus.Registerer,
 ) *StoreReconciler {
-	return &StoreReconciler{
-		store:  store,
-		pubsub: ps,
-		logger: logger,
-		cfg:    cfg,
-		clock:  clock,
-		done:   make(chan struct{}, 1),
+	reconciler := &StoreReconciler{
+		store:      store,
+		pubsub:     ps,
+		logger:     logger,
+		cfg:        cfg,
+		clock:      clock,
+		registerer: registerer,
+		done:       make(chan struct{}, 1),
 	}
+
+	reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
+	if err := registerer.Register(reconciler.metrics); err != nil {
+		// If the registerer fails to register the metrics collector, it's not fatal.
+		logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
+	}
+
+	return reconciler
 }
 
 func (c *StoreReconciler) Run(ctx context.Context) {
@@ -128,6 +140,17 @@ func (c *StoreReconciler) Stop(ctx context.Context, cause error) {
 		return
 	}
 
+	// Unregister the metrics collector.
+	if c.metrics != nil && c.registerer != nil {
+		if !c.registerer.Unregister(c.metrics) {
+			// The API doesn't allow us to know why the de-registration failed, but it's not very consequential.
+			// The only time this would be an issue is if the premium license is removed, leading to the feature being
+			// disabled (and consequently this Stop method being called), and then adding a new license which enables the
+			// feature again. If the metrics cannot be registered, it'll log an error from NewStoreReconciler.
+			c.logger.Warn(context.Background(), "failed to unregister metrics collector")
+		}
+	}
+
 	// If the reconciler is not running, there's nothing else to do.
 	if !c.running.Load() {
 		return
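The unregister comment above describes a stop/restart cycle; the stand-in sketch below (a plain counter rather than the prebuilds collector) shows the prometheus registry behavior it relies on, namely that a collector can be registered again after a successful Unregister.

package example

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// lifecycle demonstrates the register/unregister/re-register cycle.
func lifecycle() {
	registry := prometheus.NewRegistry()
	c := prometheus.NewCounter(prometheus.CounterOpts{Name: "example_total", Help: "Example."})

	_ = registry.Register(c)     // registered when the reconciler starts
	ok := registry.Unregister(c) // returns false if the collector was never registered
	fmt.Println("unregistered:", ok)

	// Re-registering after a clean unregister succeeds, which is what allows a
	// stopped-and-restarted reconciler to expose its metrics again.
	if err := registry.Register(c); err != nil {
		fmt.Println("re-register failed:", err)
	}
}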


@@ -8,6 +8,9 @@ import (
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/coder/coder/v2/coderd/database/dbtime"
 	"github.com/coder/coder/v2/coderd/util/slice"
 	"github.com/google/uuid"
@@ -45,7 +48,7 @@ func TestNoReconciliationActionsIfNoPresets(t *testing.T) {
 		ReconciliationInterval: serpent.Duration(testutil.WaitLong),
 	}
 	logger := testutil.Logger(t)
-	controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t))
+	controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 	// given a template version with no presets
 	org := dbgen.Organization(t, db, database.Organization{})
@@ -90,7 +93,7 @@ func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) {
 		ReconciliationInterval: serpent.Duration(testutil.WaitLong),
 	}
 	logger := testutil.Logger(t)
-	controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t))
+	controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 	// given there are presets, but no prebuilds
 	org := dbgen.Organization(t, db, database.Organization{})
@@ -317,7 +320,7 @@ func TestPrebuildReconciliation(t *testing.T) {
 				t, &slogtest.Options{IgnoreErrors: true},
 			).Leveled(slog.LevelDebug)
 			db, pubSub := dbtestutil.NewDB(t)
-			controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+			controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 			ownerID := uuid.New()
 			dbgen.User(t, db, database.User{
@@ -419,7 +422,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) {
 		t, &slogtest.Options{IgnoreErrors: true},
 	).Leveled(slog.LevelDebug)
 	db, pubSub := dbtestutil.NewDB(t)
-	controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+	controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 	ownerID := uuid.New()
 	dbgen.User(t, db, database.User{
@@ -503,7 +506,7 @@ func TestInvalidPreset(t *testing.T) {
 		t, &slogtest.Options{IgnoreErrors: true},
 	).Leveled(slog.LevelDebug)
 	db, pubSub := dbtestutil.NewDB(t)
-	controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t))
+	controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t), prometheus.NewRegistry())
 
 	ownerID := uuid.New()
 	dbgen.User(t, db, database.User{
@@ -575,7 +578,7 @@ func TestRunLoop(t *testing.T) {
 		t, &slogtest.Options{IgnoreErrors: true},
 	).Leveled(slog.LevelDebug)
 	db, pubSub := dbtestutil.NewDB(t)
-	reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock)
+	reconciler := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry())
 
 	ownerID := uuid.New()
 	dbgen.User(t, db, database.User{
@@ -705,7 +708,7 @@ func TestFailedBuildBackoff(t *testing.T) {
 		t, &slogtest.Options{IgnoreErrors: true},
 	).Leveled(slog.LevelDebug)
 	db, ps := dbtestutil.NewDB(t)
-	reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock)
+	reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock, prometheus.NewRegistry())
 
 	// Given: an active template version with presets and prebuilds configured.
 	const desiredInstances = 2
@@ -820,7 +823,7 @@ func TestReconciliationLock(t *testing.T) {
 				codersdk.PrebuildsConfig{},
 				slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug),
 				quartz.NewMock(t),
-			)
+				prometheus.NewRegistry())
 			reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error {
 				lockObtained := mutex.TryLock()
 				// As long as the postgres lock is held, this mutex should always be unlocked when we get here.
@@ -1009,6 +1012,30 @@ func setupTestDBWorkspace(
 	return workspace
 }
 
+// nolint:revive // It's a control flag, but this is a test.
+func setupTestDBWorkspaceAgent(t *testing.T, db database.Store, workspaceID uuid.UUID, eligible bool) database.WorkspaceAgent {
+	build, err := db.GetLatestWorkspaceBuildByWorkspaceID(t.Context(), workspaceID)
+	require.NoError(t, err)
+
+	res := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: build.JobID})
+	agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
+		ResourceID: res.ID,
+	})
+
+	// A prebuilt workspace is considered eligible when its agent is in a "ready" lifecycle state,
+	// i.e. connected to the control plane and all startup scripts have run.
+	if eligible {
+		require.NoError(t, db.UpdateWorkspaceAgentLifecycleStateByID(t.Context(), database.UpdateWorkspaceAgentLifecycleStateByIDParams{
+			ID:             agent.ID,
+			LifecycleState: database.WorkspaceAgentLifecycleStateReady,
+			StartedAt:      sql.NullTime{Time: dbtime.Now().Add(-time.Minute), Valid: true},
+			ReadyAt:        sql.NullTime{Time: dbtime.Now(), Valid: true},
+		}))
+	}
+
+	return agent
+}
+
 var allTransitions = []database.WorkspaceTransition{
 	database.WorkspaceTransitionStart,
 	database.WorkspaceTransitionStop,
@@ -1024,4 +1051,8 @@ var allJobStatuses = []database.ProvisionerJobStatus{
 	database.ProvisionerJobStatusCanceling,
 }
 
-// TODO (sasswart): test mutual exclusion
+// allJobStatusesExcept returns every known job status other than the given ones.
+func allJobStatusesExcept(except ...database.ProvisionerJobStatus) []database.ProvisionerJobStatus {
+	return slice.Filter(allJobStatuses, func(status database.ProvisionerJobStatus) bool {
+		return !slice.Contains(except, status)
+	})
+}