feat: expose agent metrics via Prometheus endpoint (#7011)

* WIP

* WIP

* WIP

* Agents

* fix

* 1min

* fix

* WIP

* Test

* docs

* fmt

* Add timer to measure the metrics collection

* Use CachedGaugeVec

* Unit tests

* Address PR comments
This commit is contained in:
Marcin Tojek
2023-04-07 17:48:52 +02:00
committed by GitHub
parent dd85ea8977
commit 0347231bb8
7 changed files with 629 additions and 48 deletions

View File

@ -896,6 +896,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
return xerrors.Errorf("create coder API: %w", err)
}
if cfg.Prometheus.Enable {
// Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API.
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
if err != nil {
return xerrors.Errorf("register agents prometheus metric: %w", err)
}
defer closeAgentsFunc()
}
client := codersdk.New(localURL)
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
// The certificate will likely be self-signed or for a different

View File

@ -0,0 +1,95 @@
package prometheusmetrics
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows
// for staging changes in the metrics vector. Calling "WithLabelValues(...)"
// records the requested change, but it will not be returned by
// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
// resets the wrapped gauge vector and applies all staged changes to it.
//
// CachedGaugeVec is recommended for use cases where there is a risk
// that the Prometheus collector receives incomplete metrics, collected
// in the middle of a metrics recalculation, between "Reset()" and the last
// "WithLabelValues()" call.
type CachedGaugeVec struct {
// m is held while staging records, committing them and collecting.
m sync.Mutex
// gaugeVec is the wrapped vector that "Describe" and "Collect" report from.
gaugeVec *prometheus.GaugeVec
// records holds the changes staged since the last "Commit()".
records []vectorRecord
}
// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)
// VectorOperation identifies how a staged value is applied to a gauge when
// the staged records are committed.
type VectorOperation int
const (
// VectorOperationAdd adds the staged value to the gauge.
VectorOperationAdd VectorOperation = iota
// VectorOperationSet sets the gauge to the staged value.
VectorOperationSet
)
// vectorRecord is a single staged change of a gauge in the vector.
type vectorRecord struct {
// operation tells whether value is added to the gauge or replaces it.
operation VectorOperation
// value is the operand of the operation.
value float64
// labelValues selects the gauge within the vector.
labelValues []string
}
// NewCachedGaugeVec wraps gaugeVec in a CachedGaugeVec that has no staged
// changes yet.
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
	cached := new(CachedGaugeVec)
	cached.gaugeVec = gaugeVec
	return cached
}
// Describe forwards to the wrapped gauge vector, which sends its descriptors
// to desc. Together with Collect it makes CachedGaugeVec usable as
// a prometheus.Collector.
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
v.gaugeVec.Describe(desc)
}
// Collect forwards to the wrapped gauge vector, which sends its current
// metrics to ch. The mutex is held for the duration of the call, the same
// mutex "Commit()" holds while it resets and repopulates the vector.
// Staged but uncommitted changes are not part of the result.
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
v.m.Lock()
defer v.m.Unlock()
v.gaugeVec.Collect(ch)
}
// WithLabelValues stages a change of the gauge selected by labelValues.
// The change is only recorded here; it is applied to the wrapped gauge vector
// by the next call to "Commit()".
//
// operation must be VectorOperationAdd or VectorOperationSet; any other value
// panics. labelValues is copied, so the caller is free to reuse or modify the
// slice after the call returns.
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
	switch operation {
	case VectorOperationAdd, VectorOperationSet:
	default:
		panic("unsupported vector operation")
	}

	// A variadic argument passed as "values..." shares the caller's backing
	// array. Copy it so that later modifications by the caller cannot alter
	// the staged record before it is committed.
	staged := make([]string, len(labelValues))
	copy(staged, labelValues)

	v.m.Lock()
	defer v.m.Unlock()

	v.records = append(v.records, vectorRecord{
		operation:   operation,
		value:       value,
		labelValues: staged,
	})
}
// Commit publishes every change staged since the previous commit, making it
// visible to "Collect()". The wrapped gauge vector is reset first, then the
// staged records are replayed in the order they were recorded, and finally
// the list of staged records is emptied, so the next round of
// "WithLabelValues" calls starts from scratch.
func (v *CachedGaugeVec) Commit() {
	v.m.Lock()
	defer v.m.Unlock()

	v.gaugeVec.Reset()
	for i := range v.records {
		staged := v.records[i]
		gauge := v.gaugeVec.WithLabelValues(staged.labelValues...)
		if staged.operation == VectorOperationAdd {
			gauge.Add(staged.value)
		} else if staged.operation == VectorOperationSet {
			gauge.Set(staged.value)
		}
	}

	v.records = nil
}

View File

@ -0,0 +1,140 @@
package prometheusmetrics_test
import (
"sort"
"testing"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/coder/coder/coderd/prometheusmetrics"
)
// TestCollector_Add verifies that staged "add" operations for the same label
// values are summed up once the vector is committed.
func TestCollector_Add(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// The helper performs the collection itself; no separate channel is needed.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 8, int(metrics[0].Gauge.GetValue()))            // Metric value

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 48, int(metrics[1].Gauge.GetValue()))             // Metric value
}
// TestCollector_Set verifies that, of several staged "set" operations for the
// same label values, the last one wins once the vector is committed.
func TestCollector_Set(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// The helper performs the collection itself; no separate channel is needed.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 5, int(metrics[0].Gauge.GetValue()))            // Metric value

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 6, int(metrics[1].Gauge.GetValue()))              // Metric value
}
// TestCollector_Set_Add verifies that mixed "add" and "set" operations are
// replayed in the order they were staged: a "set" discards the sum built up
// before it and later "add" operations continue from the set value.
func TestCollector_Set_Add(t *testing.T) {
	t.Parallel()

	// given
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))

	// when
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
	agentsGauge.Commit()

	// then
	// The helper performs the collection itself; no separate channel is needed.
	metrics := collectAndSortMetrics(t, agentsGauge, 2)

	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())   // Username
	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 8, int(metrics[0].Gauge.GetValue()))            // Metric value

	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
	assert.Equal(t, 6, int(metrics[1].Gauge.GetValue()))              // Metric value
}
// collectAndSortMetrics reads exactly count metrics from the collector,
// converts them to their DTO form and returns them in a deterministic order:
// sorted by the value of the first label, ties broken by the value of the
// second label.
//
// The collector must emit exactly count metrics, each with at least two
// labels; otherwise the call blocks or panics.
func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
	t.Helper()

	ch := make(chan prometheus.Metric, count)
	defer close(ch)

	var metrics []dto.Metric

	collector.Collect(ch)
	for i := 0; i < count; i++ {
		m := <-ch

		var metric dto.Metric
		err := m.Write(&metric)
		require.NoError(t, err)

		metrics = append(metrics, metric)
	}

	// Ensure always the same order of metrics. The comparator must be a strict
	// "less" over the same labels of both elements.
	sort.Slice(metrics, func(i, j int) bool {
		left, right := metrics[i].Label[0].GetValue(), metrics[j].Label[0].GetValue()
		if left != right {
			return left < right
		}
		return metrics[i].Label[1].GetValue() < metrics[j].Label[1].GetValue()
	})
	return metrics
}

View File

@ -2,13 +2,24 @@ package prometheusmetrics
import (
"context"
"database/sql"
"errors"
"fmt"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"tailscale.com/tailcfg"
"cdr.dev/slog"
"github.com/coder/coder/coderd"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbauthz"
"github.com/coder/coder/tailnet"
)
// ActiveUsers tracks the number of users that have authenticated within the past hour.
@ -106,3 +117,175 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
}()
return cancelFunc, nil
}
// Agents starts a background goroutine that periodically collects metrics
// about workspace agents and exposes them through registerer:
//
//   - coderd_agents_up: number of agents per workspace,
//   - coderd_agents_connections: agent connection status and lifecycle state,
//   - coderd_agents_connection_latencies_seconds: DERP latencies per agent,
//   - coderd_agents_apps: applications registered for the agents,
//   - coderd_prometheusmetrics_agents_execution_seconds: duration of a single
//     collection run.
//
// The collection runs every "duration"; a zero duration selects the default
// of one minute. The gauges are staged during a run and published together at
// its end, so a scrape never sees a partially recalculated state.
//
// The returned cancel function stops the background goroutine. An error is
// returned if any of the collectors cannot be registered.
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) {
	if duration == 0 {
		duration = 1 * time.Minute
	}

	agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))
	err := registerer.Register(agentsGauge)
	if err != nil {
		return nil, err
	}

	agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "connections",
		Help:      "Agent connections with statuses.",
	}, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}))
	err = registerer.Register(agentsConnectionsGauge)
	if err != nil {
		return nil, err
	}

	agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "connection_latencies_seconds",
		Help:      "Agent connection latencies in seconds.",
	}, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}))
	err = registerer.Register(agentsConnectionLatenciesGauge)
	if err != nil {
		return nil, err
	}

	agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "apps",
		Help:      "Agent applications with statuses.",
	}, []string{"agent_name", "username", "workspace_name", "app_name", "health"}))
	err = registerer.Register(agentsAppsGauge)
	if err != nil {
		return nil, err
	}

	metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "prometheusmetrics",
		Name:      "agents_execution_seconds",
		Help:      "Histogram for duration of agents metrics collection in seconds.",
		Buckets:   []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
	})
	err = registerer.Register(metricsCollectorAgents)
	if err != nil {
		return nil, err
	}

	// nolint:gocritic // Prometheus must collect metrics for all Coder users.
	ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx))
	ticker := time.NewTicker(duration)
	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			logger.Debug(ctx, "Agent metrics collection is starting")
			timer := prometheus.NewTimer(metricsCollectorAgents)

			workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
				AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()),
			})
			if err != nil {
				logger.Error(ctx, "can't get workspace rows", slog.Error(err))
				continue
			}

			for _, workspace := range workspaceRows {
				user, err := db.GetUserByID(ctx, workspace.OwnerID)
				if err != nil {
					logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err))
					// NOTE(review): the user lookup failed, so user.Username is
					// presumably empty here and the workspace is reported with an
					// empty "username" label - confirm this is intended.
					agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
					continue
				}

				agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
				if err != nil {
					logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err))
					agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
					continue
				}

				if len(agents) == 0 {
					logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID))
					agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
					continue
				}

				for _, agent := range agents {
					// Collect information about agents
					agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name)

					connectionStatus := agent.Status(agentInactiveDisconnectTimeout)
					node := (*coordinator.Load()).Node(agent.ID)

					tailnetNode := "unknown"
					if node != nil {
						tailnetNode = node.ID.String()
					}

					agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode)

					if node == nil {
						logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID))
					} else {
						// Collect information about connection latencies
						for rawRegion, latency := range node.DERPLatency {
							// Only the part before the first "-" is parsed as the
							// numeric region ID.
							regionParts := strings.SplitN(rawRegion, "-", 2)
							regionID, err := strconv.Atoi(regionParts[0])
							if err != nil {
								logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", rawRegion), slog.Error(err))
								continue
							}

							region, found := derpMap.Regions[regionID]
							if !found {
								// It's possible that a workspace agent is using an old DERPMap
								// and reports regions that do not exist. If that's the case,
								// report the region as unknown!
								region = &tailcfg.DERPRegion{
									RegionID:   regionID,
									RegionName: fmt.Sprintf("Unnamed %d", regionID),
								}
							}

							// NOTE(review): the first label is declared as "agent_id" but
							// receives the agent name - confirm which one is intended.
							agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID))
						}
					}

					// Collect information about registered applications
					apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID)
					if err != nil && !errors.Is(err, sql.ErrNoRows) {
						logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err))
						continue
					}

					for _, app := range apps {
						agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health))
					}
				}
			}

			// Publish the staged values of all gauges.
			agentsGauge.Commit()
			agentsConnectionsGauge.Commit()
			agentsConnectionLatenciesGauge.Commit()
			agentsAppsGauge.Commit()

			logger.Debug(ctx, "Agent metrics collection is done")
			// ObserveDuration already records the elapsed time on the histogram
			// the timer was created with; observing its result again would count
			// every collection run twice.
			timer.ObserveDuration()
		}
	}()
	return cancelFunc, nil
}

View File

@ -3,6 +3,7 @@ package prometheusmetrics_test
import (
"context"
"database/sql"
"sync/atomic"
"testing"
"time"
@ -11,11 +12,18 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbfake"
"github.com/coder/coder/coderd/database/dbgen"
"github.com/coder/coder/coderd/prometheusmetrics"
"github.com/coder/coder/codersdk"
"github.com/coder/coder/provisioner/echo"
"github.com/coder/coder/provisionersdk/proto"
"github.com/coder/coder/tailnet"
"github.com/coder/coder/tailnet/tailnettest"
"github.com/coder/coder/testutil"
)
@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) {
})
}
}
// TestAgents builds a workspace with a single agent and a single application,
// starts the agent metrics collector with a very short interval and waits
// until all expected agent metrics show up in the registry with the expected
// labels and values.
func TestAgents(t *testing.T) {
	t.Parallel()

	// Build a sample workspace with test agent and fake application
	client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
	db := api.Database

	user := coderdtest.CreateFirstUser(t, client)
	version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
		Parse:         echo.ParseComplete,
		ProvisionPlan: echo.ProvisionComplete,
		ProvisionApply: []*proto.Provision_Response{{
			Type: &proto.Provision_Response_Complete{
				Complete: &proto.Provision_Complete{
					Resources: []*proto.Resource{{
						Name: "example",
						Type: "aws_instance",
						Agents: []*proto.Agent{{
							Id:        uuid.NewString(),
							Name:      "testagent",
							Directory: t.TempDir(),
							Auth: &proto.Agent_Token{
								Token: uuid.NewString(),
							},
							Apps: []*proto.App{
								{
									Slug:         "fake-app",
									DisplayName:  "Fake application",
									SharingLevel: proto.AppSharingLevel_OWNER,
									// Hopefully this IP and port doesn't exist.
									Url: "http://127.1.0.1:65535",
								},
							},
						}},
					}},
				},
			},
		}},
	})
	template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
	coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
	workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
	coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

	// given
	coordinator := tailnet.NewCoordinator()
	coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
	coordinatorPtr.Store(&coordinator)
	derpMap := tailnettest.RunDERPAndSTUN(t)
	agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests
	registry := prometheus.NewRegistry()

	// when
	cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond)
	// Check the error before registering the cleanup: on failure cancel is
	// nil and calling it during cleanup would panic and hide the real error.
	require.NoError(t, err)
	t.Cleanup(cancel)

	// then
	var agentsUp bool
	var agentsConnections bool
	var agentsApps bool
	var agentsExecutionInSeconds bool
	require.Eventually(t, func() bool {
		metrics, err := registry.Gather()
		assert.NoError(t, err)

		if len(metrics) < 1 {
			return false
		}

		for _, metric := range metrics {
			switch metric.GetName() {
			case "coderd_agents_up":
				assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue())     // Username
				assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name
				assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue()))            // Metric value
				agentsUp = true
			case "coderd_agents_connections":
				assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue())    // Agent name
				assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue())      // Lifecycle state
				assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue())   // Status
				assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue())      // Tailnet node
				assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue())     // Username
				assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
				assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue()))            // Metric value
				agentsConnections = true
			case "coderd_agents_apps":
				assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue())        // Agent name
				assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name
				assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue())         // Health
				assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue())         // Username
				assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue())     // Workspace name
				assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue()))                // Metric value
				agentsApps = true
			case "coderd_prometheusmetrics_agents_execution_seconds":
				agentsExecutionInSeconds = true
			default:
				require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
			}
		}
		return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
	}, testutil.WaitShort, testutil.IntervalFast)
}

View File

@ -30,7 +30,11 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
| Name | Type | Description | Labels |
| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` |
| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` |
| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | |
| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | |
| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | |
@ -38,6 +42,7 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically
| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` |
| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` |
| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` |
| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | |
| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` |
| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` |
| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` |

View File

@ -1,3 +1,23 @@
# HELP coderd_agents_apps Agent applications with statuses.
# TYPE coderd_agents_apps gauge
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1
# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds.
# TYPE coderd_agents_connection_latencies_seconds gauge
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416
# HELP coderd_agents_connections Agent connections with statuses.
# TYPE coderd_agents_connections gauge
coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1
coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1
coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1
# HELP coderd_agents_up The number of active agents per workspace.
# TYPE coderd_agents_up gauge
coderd_agents_up{username="admin",workspace_name="workspace-1"} 1
coderd_agents_up{username="admin",workspace_name="workspace-2"} 1
coderd_agents_up{username="admin",workspace_name="workspace-3"} 1
# HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds.
# TYPE coderd_api_websocket_durations_seconds histogram
coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0
@ -568,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files
# HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status.
# TYPE coderd_api_workspace_latest_build_total gauge
coderd_api_workspace_latest_build_total{status="succeeded"} 1
# HELP coderd_prometheusmetrics_agents_execution_seconds Histogram for duration of agents metrics collection in seconds.
# TYPE coderd_prometheusmetrics_agents_execution_seconds histogram
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.001"} 0
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.005"} 0
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.01"} 0
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.025"} 0
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.05"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.1"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.5"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="1"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="5"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="10"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="30"} 2
coderd_prometheusmetrics_agents_execution_seconds_bucket{le="+Inf"} 2
coderd_prometheusmetrics_agents_execution_seconds_sum 0.0592915
coderd_prometheusmetrics_agents_execution_seconds_count 2
# HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds.
# TYPE coderd_provisionerd_job_timings_seconds histogram
coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0