feat: expose agent metrics via Prometheus endpoint (#7011)

* WIP

* WIP

* WIP

* Agents

* fix

* 1min

* fix

* WIP

* Test

* docs

* fmt

* Add timer to measure the metrics collection

* Use CachedGaugeVec

* Unit tests

* Address PR comments
This commit is contained in:
Marcin Tojek
2023-04-07 17:48:52 +02:00
committed by GitHub
parent dd85ea8977
commit 0347231bb8
7 changed files with 629 additions and 48 deletions

View File

@ -3,6 +3,7 @@ package prometheusmetrics_test
import (
"context"
"database/sql"
"sync/atomic"
"testing"
"time"
@ -11,11 +12,18 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbfake"
"github.com/coder/coder/coderd/database/dbgen"
"github.com/coder/coder/coderd/prometheusmetrics"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/provisionersdk/proto"
"github.com/coder/coder/tailnet"
"github.com/coder/coder/tailnet/tailnettest"
"github.com/coder/coder/testutil"
)
@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) {
})
}
}
func TestAgents(t *testing.T) {
t.Parallel()
// Build a sample workspace with test agent and fake application
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
db := api.Database
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.ProvisionComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "example",
Type: "aws_instance",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "testagent",
Directory: t.TempDir(),
Auth: &proto.Agent_Token{
Token: uuid.NewString(),
},
Apps: []*proto.App{
{
Slug: "fake-app",
DisplayName: "Fake application",
SharingLevel: proto.AppSharingLevel_OWNER,
// Hopefully this IP and port doesn't exist.
Url: "http://127.1.0.1:65535",
},
},
}},
}},
},
},
}},
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
// given
coordinator := tailnet.NewCoordinator()
coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
coordinatorPtr.Store(&coordinator)
derpMap := tailnettest.RunDERPAndSTUN(t)
agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests
registry := prometheus.NewRegistry()
// when
cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond)
t.Cleanup(cancel)
// then
require.NoError(t, err)
var agentsUp bool
var agentsConnections bool
var agentsApps bool
var agentsExecutionInSeconds bool
require.Eventually(t, func() bool {
metrics, err := registry.Gather()
assert.NoError(t, err)
if len(metrics) < 1 {
return false
}
for _, metric := range metrics {
switch metric.GetName() {
case "coderd_agents_up":
assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username
assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
agentsUp = true
case "coderd_agents_connections":
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state
assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status
assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node
assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username
assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
agentsConnections = true
case "coderd_agents_apps":
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name
assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health
assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username
assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
agentsApps = true
case "coderd_prometheusmetrics_agents_execution_seconds":
agentsExecutionInSeconds = true
default:
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
}
}
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
}, testutil.WaitShort, testutil.IntervalFast)
}