Files
coder/enterprise/coderd/prebuilds/metricscollector_test.go
Danny Kopping 6e967780c9 feat: track resource replacements when claiming a prebuilt workspace (#17571)
Closes https://github.com/coder/internal/issues/369

We can't know a priori whether a replacement (i.e. drift of Terraform state
leading to a resource needing to be deleted/recreated) will take place;
we can only detect it at `plan` time, because the provider
decides whether a resource must be replaced and it cannot be inferred
through static analysis of the template.

**This is likely to be the most common gotcha with using prebuilds,
since it requires a slight template modification to use prebuilds
effectively**, so let's head this off before it's an issue for
customers.

Drift details will now be logged in the workspace build logs:


![image](https://github.com/user-attachments/assets/da1988b6-2cbe-4a79-a3c5-ea29891f3d6f)

Plus a notification will be sent to template admins when this situation
arises:


![image](https://github.com/user-attachments/assets/39d555b1-a262-4a3e-b529-03b9f23bf66a)

A new metric - `coderd_prebuilt_workspaces_resource_replacements_total`
- will also increment each time a workspace encounters replacements.

We only track _that_ a resource replacement occurred, not how many. Just
one is enough to ruin a prebuild, but we can't know a priori which
replacement would cause this.
For example, say we have 2 replacements: a `docker_container` and a
`null_resource`; we don't know which one might
cause an issue (or indeed if either would), so we just track the
replacement.

---------

Signed-off-by: Danny Kopping <dannykopping@gmail.com>
2025-05-14 14:52:22 +02:00

337 lines
12 KiB
Go

package prebuilds_test
import (
"fmt"
"slices"
"testing"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"tailscale.com/types/ptr"
"github.com/prometheus/client_golang/prometheus"
prometheus_client "github.com/prometheus/client_model/go"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/enterprise/coderd/prebuilds"
"github.com/coder/coder/v2/testutil"
)
// TestMetricsCollector verifies the values reported by the prebuilds metrics
// collector for a range of workspace/build states.
//
// Each test case lists candidate values for several parameters (transition,
// job status, initiator, owner, template-deleted flag, agent eligibility). A
// subtest is run for every combination of those values. Every subtest seeds its
// own database with two templates (one version, one preset and one workspace
// with an agent each), forces the collector to refresh its state, gathers from
// a pedantic registry, and then asserts the expected counter/gauge values for
// each template/preset/organization label set.
//
// The test is skipped unless it runs against Postgres.
func TestMetricsCollector(t *testing.T) {
	t.Parallel()

	if !dbtestutil.WillUsePostgres() {
		t.Skip("this test requires postgres")
	}

	// metricCheck describes a single expected metric value.
	type metricCheck struct {
		// name is the metric family name to look up.
		name string
		// value is the expected value; a nil value skips the assertion.
		value *float64
		// isCounter selects whether the counter or the gauge value is read.
		isCounter bool
	}

	// testCase describes a scenario. Every slice of parameters is expanded
	// into a cartesian product of subtests, all of which must satisfy metrics.
	type testCase struct {
		name string
		transitions []database.WorkspaceTransition
		jobStatuses []database.ProvisionerJobStatus
		initiatorIDs []uuid.UUID
		ownerIDs []uuid.UUID
		metrics []metricCheck
		templateDeleted []bool
		eligible []bool
	}

	tests := []testCase{
		{
			name:         "prebuild provisioned but not completed",
			transitions:  allTransitions,
			jobStatuses:  allJobStatusesExcept(database.ProvisionerJobStatusPending, database.ProvisionerJobStatusRunning, database.ProvisionerJobStatusCanceling),
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
				{prebuilds.MetricFailedCount, ptr.To(0.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "prebuild running",
			transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
			jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
				{prebuilds.MetricFailedCount, ptr.To(0.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "prebuild failed",
			transitions:  allTransitions,
			jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusFailed},
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricFailedCount, ptr.To(1.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "prebuild eligible",
			transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
			jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
				{prebuilds.MetricFailedCount, ptr.To(0.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(1.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{true},
		},
		{
			name:         "prebuild ineligible",
			transitions:  allTransitions,
			jobStatuses:  allJobStatusesExcept(database.ProvisionerJobStatusSucceeded),
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricClaimedCount, ptr.To(0.0), true},
				{prebuilds.MetricFailedCount, ptr.To(0.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "prebuild claimed",
			transitions:  allTransitions,
			jobStatuses:  allJobStatuses,
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{uuid.New()},
			metrics: []metricCheck{
				{prebuilds.MetricCreatedCount, ptr.To(1.0), true},
				{prebuilds.MetricClaimedCount, ptr.To(1.0), true},
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "workspaces that were not created by the prebuilds user are not counted",
			transitions:  allTransitions,
			jobStatuses:  allJobStatuses,
			initiatorIDs: []uuid.UUID{uuid.New()},
			ownerIDs:     []uuid.UUID{uuid.New()},
			metrics: []metricCheck{
				{prebuilds.MetricDesiredGauge, ptr.To(1.0), false},
				{prebuilds.MetricRunningGauge, ptr.To(0.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{false},
			eligible:        []bool{false},
		},
		{
			name:         "deleted templates never desire prebuilds",
			transitions:  allTransitions,
			jobStatuses:  allJobStatuses,
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID, uuid.New()},
			metrics: []metricCheck{
				{prebuilds.MetricDesiredGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{true},
			eligible:        []bool{false},
		},
		{
			name:         "running prebuilds for deleted templates are still counted, so that they can be deleted",
			transitions:  []database.WorkspaceTransition{database.WorkspaceTransitionStart},
			jobStatuses:  []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded},
			initiatorIDs: []uuid.UUID{agplprebuilds.SystemUserID},
			ownerIDs:     []uuid.UUID{agplprebuilds.SystemUserID},
			metrics: []metricCheck{
				{prebuilds.MetricRunningGauge, ptr.To(1.0), false},
				{prebuilds.MetricEligibleGauge, ptr.To(0.0), false},
			},
			templateDeleted: []bool{true},
			eligible:        []bool{false},
		},
	}

	// Expand every test case into the cartesian product of its parameters.
	// The per-iteration copies keep each parallel subtest closure bound to its
	// own values on toolchains that share loop variables across iterations.
	for _, test := range tests {
		test := test // capture for parallel
		for _, transition := range test.transitions {
			transition := transition // capture for parallel
			for _, jobStatus := range test.jobStatuses {
				jobStatus := jobStatus // capture for parallel
				for _, initiatorID := range test.initiatorIDs {
					initiatorID := initiatorID // capture for parallel
					for _, ownerID := range test.ownerIDs {
						ownerID := ownerID // capture for parallel
						for _, templateDeleted := range test.templateDeleted {
							templateDeleted := templateDeleted // capture for parallel
							for _, eligible := range test.eligible {
								eligible := eligible // capture for parallel
								t.Run(fmt.Sprintf("%v/transition:%s/jobStatus:%s", test.name, transition, jobStatus), func(t *testing.T) {
									t.Parallel()

									logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})

									// The subtest name only carries part of the parameter set, so
									// dump the full combination when a subtest fails.
									t.Cleanup(func() {
										if t.Failed() {
											t.Logf("failed to run test: %s", test.name)
											t.Logf("transition: %s", transition)
											t.Logf("jobStatus: %s", jobStatus)
											t.Logf("initiatorID: %s", initiatorID)
											t.Logf("ownerID: %s", ownerID)
											t.Logf("templateDeleted: %t", templateDeleted)
											t.Logf("eligible: %t", eligible)
										}
									})

									clock := quartz.NewMock(t)
									db, pubsub := dbtestutil.NewDB(t)
									reconciler := prebuilds.NewStoreReconciler(db, pubsub, codersdk.PrebuildsConfig{}, logger, quartz.NewMock(t), prometheus.NewRegistry(), newNoopEnqueuer())
									ctx := testutil.Context(t, testutil.WaitLong)

									// Create every owner/initiator referenced by the test case exactly
									// once. The system user is treated as already present.
									createdUsers := []uuid.UUID{agplprebuilds.SystemUserID}
									for _, user := range slices.Concat(test.ownerIDs, test.initiatorIDs) {
										if !slices.Contains(createdUsers, user) {
											dbgen.User(t, db, database.User{
												ID: user,
											})
											createdUsers = append(createdUsers, user)
										}
									}

									collector := prebuilds.NewMetricsCollector(db, logger, reconciler)
									registry := prometheus.NewPedanticRegistry()
									// Fail fast on a registration error rather than surfacing it
									// later as a missing metric.
									require.NoError(t, registry.Register(collector))

									// Seed identical fixtures for several templates so that the
									// per-template assertions below cover more than one label set.
									numTemplates := 2
									for i := 0; i < numTemplates; i++ {
										org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted)
										templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubsub, org.ID, ownerID, template.ID)
										preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String())
										workspace, _ := setupTestDBWorkspace(
											t, clock, db, pubsub,
											transition, jobStatus, org.ID, preset, template.ID, templateVersionID, initiatorID, ownerID,
										)
										setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
									}

									// Force an update to the metrics state to allow the collector to collect fresh metrics.
									// nolint:gocritic // Authz context needed to retrieve state.
									require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong))

									metricsFamilies, err := registry.Gather()
									require.NoError(t, err)

									templates, err := db.GetTemplates(ctx)
									require.NoError(t, err)
									require.Equal(t, numTemplates, len(templates))

									for _, template := range templates {
										org, err := db.GetOrganizationByID(ctx, template.OrganizationID)
										require.NoError(t, err)
										templateVersions, err := db.GetTemplateVersionsByTemplateID(ctx, database.GetTemplateVersionsByTemplateIDParams{
											TemplateID: template.ID,
										})
										require.NoError(t, err)
										require.Equal(t, 1, len(templateVersions))

										presets, err := db.GetPresetsByTemplateVersionID(ctx, templateVersions[0].ID)
										require.NoError(t, err)
										require.Equal(t, 1, len(presets))

										for _, preset := range presets {
											labels := map[string]string{
												"template_name":     template.Name,
												"preset_name":       preset.Name,
												"organization_name": org.Name,
											}

											for _, check := range test.metrics {
												metric := findMetric(metricsFamilies, check.name, labels)
												// A nil expectation means "do not assert on this metric".
												if check.value == nil {
													continue
												}

												require.NotNil(t, metric, "metric %s should exist", check.name)

												if check.isCounter {
													require.Equal(t, *check.value, metric.GetCounter().GetValue(), "counter %s value mismatch", check.name)
												} else {
													require.Equal(t, *check.value, metric.GetGauge().GetValue(), "gauge %s value mismatch", check.name)
												}
											}
										}
									}
								})
							}
						}
					}
				}
			}
		}
	}
}
// findMetric looks through the gathered metric families for the family called
// name and returns the first metric in it whose labels contain every
// name/value pair in labels. Labels present on the metric but absent from
// labels are ignored. It returns nil when the family is not present or when no
// metric in it carries all of the requested labels.
func findMetric(metricsFamilies []*prometheus_client.MetricFamily, name string, labels map[string]string) *prometheus_client.Metric {
	for _, metricFamily := range metricsFamilies {
		if metricFamily.GetName() != name {
			continue
		}

	nextMetric:
		for _, metric := range metricFamily.GetMetric() {
			labelPairs := metric.GetLabel()

			// Convert label pairs to map for easier lookup.
			metricLabels := make(map[string]string, len(labelPairs))
			for _, label := range labelPairs {
				metricLabels[label.GetName()] = label.GetValue()
			}

			// Every requested label must match. On the first mismatch move on
			// to the next metric; a bare continue here would only advance this
			// inner loop and the mismatching metric would still be returned.
			for wantName, wantValue := range labels {
				if metricLabels[wantName] != wantValue {
					continue nextMetric
				}
			}

			return metric
		}
	}
	return nil
}