package prebuilds

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"golang.org/x/xerrors"

	"cdr.dev/slog"

	"github.com/coder/coder/v2/coderd/database"
	"github.com/coder/coder/v2/coderd/database/dbtime"
	"github.com/coder/coder/v2/coderd/prebuilds"
)

// Metric descriptors exported by MetricsCollector. Every descriptor except
// lastUpdateDesc carries the label set declared in labels.
var (
	// labels is the ordered label set; label values passed when emitting a
	// metric must follow the same order.
	labels = []string{"template_name", "preset_name", "organization_name"}

	// Emitted as counters from the cached prebuild metrics rows.
	createdPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_created_total",
		"Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
			"template preset.",
		labels,
		nil,
	)
	failedPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_failed_total",
		"Total number of prebuilt workspaces that failed to build.",
		labels,
		nil,
	)
	claimedPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_claimed_total",
		"Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
			"with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
		labels,
		nil,
	)

	// Emitted as gauges from the per-preset state computed off the cached snapshot.
	desiredPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_desired",
		"Target number of prebuilt workspaces that should be available for each template preset.",
		labels,
		nil,
	)
	runningPrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_running",
		"Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
			"successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
		labels,
		nil,
	)
	eligiblePrebuildsDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_eligible",
		"Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
			"have completed their build process with their agent reporting 'ready' status.",
		labels,
		nil,
	)

	// Unlabelled gauge holding the creation time of the cached state.
	lastUpdateDesc = prometheus.NewDesc(
		"coderd_prebuilt_workspaces_metrics_last_updated",
		"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
		[]string{},
		nil,
	)
)

const (
	// metricsUpdateInterval is a 15 second refresh interval for the cached state.
	metricsUpdateInterval = 15 * time.Second
	// metricsUpdateTimeout is a 10 second time limit for one refresh.
	metricsUpdateTimeout = 10 * time.Second
)

// MetricsCollector is a prometheus.Collector that serves prebuilds metrics
// from a cached state rather than querying the database on each scrape.
type MetricsCollector struct {
	database    database.Store
	logger      slog.Logger
	snapshotter prebuilds.StateSnapshotter

	// latestState is the most recently stored state; it is nil until the
	// first successful store.
	latestState atomic.Pointer[metricsState]
}

// Compile-time check that *MetricsCollector implements prometheus.Collector.
var _ prometheus.Collector = (*MetricsCollector)(nil)

// NewMetricsCollector returns a collector wired to db and snapshotter whose
// logger is a child of logger named "prebuilds_metrics_collector". The
// returned collector has no cached state yet.
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
	mc := new(MetricsCollector)
	mc.database = db
	mc.logger = logger.Named("prebuilds_metrics_collector")
	mc.snapshotter = snapshotter
	return mc
}

// Describe sends the descriptor of every metric this collector can emit.
func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		createdPrebuildsDesc,
		failedPrebuildsDesc,
		claimedPrebuildsDesc,
		desiredPrebuildsDesc,
		runningPrebuildsDesc,
		eligiblePrebuildsDesc,
		lastUpdateDesc,
	}
	for _, desc := range descs {
		descCh <- desc
	}
}

// Collect emits metrics built from the cached state.
// The state is cached because this function can be called multiple times per
// second and retrieving the current state is an expensive operation.
//
// When no state has been stored yet, only the last-updated gauge is emitted,
// with a value of 0.
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
	// emit builds a constant metric and pushes it onto the output channel.
	emit := func(desc *prometheus.Desc, valueType prometheus.ValueType, value float64, labelValues ...string) {
		metricsCh <- prometheus.MustNewConstMetric(desc, valueType, value, labelValues...)
	}

	// Load the pointer once; it's fine if a newer state is stored while this
	// call is still running.
	cached := mc.latestState.Load()
	if cached == nil {
		mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
		emit(lastUpdateDesc, prometheus.GaugeValue, 0)
		return
	}

	// Counters come straight from the cached metrics rows.
	for i := range cached.prebuildMetrics {
		row := cached.prebuildMetrics[i]
		emit(createdPrebuildsDesc, prometheus.CounterValue, float64(row.CreatedCount),
			row.TemplateName, row.PresetName, row.OrganizationName)
		emit(failedPrebuildsDesc, prometheus.CounterValue, float64(row.FailedCount),
			row.TemplateName, row.PresetName, row.OrganizationName)
		emit(claimedPrebuildsDesc, prometheus.CounterValue, float64(row.ClaimedCount),
			row.TemplateName, row.PresetName, row.OrganizationName)
	}

	// Gauges are derived per preset from the cached snapshot; presets not
	// using the active version are skipped.
	for _, preset := range cached.snapshot.Presets {
		if !preset.UsingActiveVersion {
			continue
		}

		filtered, err := cached.snapshot.FilterByPreset(preset.ID)
		if err != nil {
			mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
			continue
		}

		presetState := filtered.CalculateState()
		emit(desiredPrebuildsDesc, prometheus.GaugeValue, float64(presetState.Desired),
			preset.TemplateName, preset.Name, preset.OrganizationName)
		emit(runningPrebuildsDesc, prometheus.GaugeValue, float64(presetState.Actual),
			preset.TemplateName, preset.Name, preset.OrganizationName)
		emit(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(presetState.Eligible),
			preset.TemplateName, preset.Name, preset.OrganizationName)
	}

	emit(lastUpdateDesc, prometheus.GaugeValue, float64(cached.createdAt.Unix()))
}

// metricsState is the cached data that Collect reads.
type metricsState struct {
	// prebuildMetrics holds the rows behind the created/failed/claimed counters.
	prebuildMetrics []database.GetPrebuildMetricsRow
	// snapshot is the state snapshot behind the desired/running/eligible gauges.
	snapshot *prebuilds.GlobalSnapshot
	// createdAt is reported through the last-updated gauge.
	createdAt time.Time
}

// BackgroundFetch updates the metrics state every given interval.
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) { tick := time.NewTicker(time.Nanosecond) defer tick.Stop() for { select { case <-ctx.Done(): return case <-tick.C: // Tick immediately, then set regular interval. tick.Reset(updateInterval) if err := mc.UpdateState(ctx, updateTimeout); err != nil { mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err)) } } } } // UpdateState builds the current metrics state. func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { start := time.Now() fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) defer fetchCancel() prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx) if err != nil { return xerrors.Errorf("fetch prebuild metrics: %w", err) } snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database) if err != nil { return xerrors.Errorf("snapshot state: %w", err) } mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) mc.latestState.Store(&metricsState{ prebuildMetrics: prebuildMetrics, snapshot: snapshot, createdAt: dbtime.Now(), }) return nil }