mirror of
https://github.com/coder/coder.git
synced 2025-07-03 16:13:58 +00:00
feat: expose agent metrics via Prometheus endpoint (#7011)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * Address PR comments
This commit is contained in:
@ -3,6 +3,7 @@ package prometheusmetrics_test
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -11,11 +12,18 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"cdr.dev/slog/sloggers/slogtest"
|
||||
|
||||
"github.com/coder/coder/coderd/coderdtest"
|
||||
"github.com/coder/coder/coderd/database"
|
||||
"github.com/coder/coder/coderd/database/dbfake"
|
||||
"github.com/coder/coder/coderd/database/dbgen"
|
||||
"github.com/coder/coder/coderd/prometheusmetrics"
|
||||
"github.com/coder/coder/codersdk"
|
||||
"github.com/coder/coder/provisioner/echo"
|
||||
"github.com/coder/coder/provisionersdk/proto"
|
||||
"github.com/coder/coder/tailnet"
|
||||
"github.com/coder/coder/tailnet/tailnettest"
|
||||
"github.com/coder/coder/testutil"
|
||||
)
|
||||
|
||||
@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgents(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Build a sample workspace with test agent and fake application
|
||||
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
|
||||
db := api.Database
|
||||
|
||||
user := coderdtest.CreateFirstUser(t, client)
|
||||
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
|
||||
Parse: echo.ParseComplete,
|
||||
ProvisionPlan: echo.ProvisionComplete,
|
||||
ProvisionApply: []*proto.Provision_Response{{
|
||||
Type: &proto.Provision_Response_Complete{
|
||||
Complete: &proto.Provision_Complete{
|
||||
Resources: []*proto.Resource{{
|
||||
Name: "example",
|
||||
Type: "aws_instance",
|
||||
Agents: []*proto.Agent{{
|
||||
Id: uuid.NewString(),
|
||||
Name: "testagent",
|
||||
Directory: t.TempDir(),
|
||||
Auth: &proto.Agent_Token{
|
||||
Token: uuid.NewString(),
|
||||
},
|
||||
Apps: []*proto.App{
|
||||
{
|
||||
Slug: "fake-app",
|
||||
DisplayName: "Fake application",
|
||||
SharingLevel: proto.AppSharingLevel_OWNER,
|
||||
// Hopefully this IP and port doesn't exist.
|
||||
Url: "http://127.1.0.1:65535",
|
||||
},
|
||||
},
|
||||
}},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}},
|
||||
})
|
||||
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
|
||||
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
|
||||
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
|
||||
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
|
||||
|
||||
// given
|
||||
coordinator := tailnet.NewCoordinator()
|
||||
coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
|
||||
coordinatorPtr.Store(&coordinator)
|
||||
derpMap := tailnettest.RunDERPAndSTUN(t)
|
||||
agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests
|
||||
registry := prometheus.NewRegistry()
|
||||
|
||||
// when
|
||||
cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
// then
|
||||
require.NoError(t, err)
|
||||
|
||||
var agentsUp bool
|
||||
var agentsConnections bool
|
||||
var agentsApps bool
|
||||
var agentsExecutionInSeconds bool
|
||||
require.Eventually(t, func() bool {
|
||||
metrics, err := registry.Gather()
|
||||
assert.NoError(t, err)
|
||||
|
||||
if len(metrics) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, metric := range metrics {
|
||||
switch metric.GetName() {
|
||||
case "coderd_agents_up":
|
||||
assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username
|
||||
assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name
|
||||
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||
agentsUp = true
|
||||
case "coderd_agents_connections":
|
||||
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
|
||||
assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state
|
||||
assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status
|
||||
assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node
|
||||
assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username
|
||||
assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
|
||||
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||
agentsConnections = true
|
||||
case "coderd_agents_apps":
|
||||
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
|
||||
assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name
|
||||
assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health
|
||||
assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username
|
||||
assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name
|
||||
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||
agentsApps = true
|
||||
case "coderd_prometheusmetrics_agents_execution_seconds":
|
||||
agentsExecutionInSeconds = true
|
||||
default:
|
||||
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
|
||||
}
|
||||
}
|
||||
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
|
||||
}, testutil.WaitShort, testutil.IntervalFast)
|
||||
}
|
||||
|
Reference in New Issue
Block a user