mirror of
https://github.com/coder/coder.git
synced 2025-07-03 16:13:58 +00:00
feat: expose agent metrics via Prometheus endpoint (#7011)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * Address PR comments
This commit is contained in:
@ -896,6 +896,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
|
|||||||
return xerrors.Errorf("create coder API: %w", err)
|
return xerrors.Errorf("create coder API: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cfg.Prometheus.Enable {
|
||||||
|
// Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API.
|
||||||
|
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
|
||||||
|
if err != nil {
|
||||||
|
return xerrors.Errorf("register agents prometheus metric: %w", err)
|
||||||
|
}
|
||||||
|
defer closeAgentsFunc()
|
||||||
|
}
|
||||||
|
|
||||||
client := codersdk.New(localURL)
|
client := codersdk.New(localURL)
|
||||||
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
|
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
|
||||||
// The certificate will likely be self-signed or for a different
|
// The certificate will likely be self-signed or for a different
|
||||||
|
95
coderd/prometheusmetrics/collector.go
Normal file
95
coderd/prometheusmetrics/collector.go
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
package prometheusmetrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows
|
||||||
|
// for staging changes in the metrics vector. Calling "WithLabelValues(...)"
|
||||||
|
// will update the internal gauge value, but it will not be returned by
|
||||||
|
// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
|
||||||
|
// resets the internal gauge and applies all staged changes to it.
|
||||||
|
//
|
||||||
|
// The Use of CachedGaugeVec is recommended for use cases when there is a risk
|
||||||
|
// that the Prometheus collector receives incomplete metrics, collected
|
||||||
|
// in the middle of metrics recalculation, between "Reset()" and the last
|
||||||
|
// "WithLabelValues()" call.
|
||||||
|
type CachedGaugeVec struct {
|
||||||
|
m sync.Mutex
|
||||||
|
|
||||||
|
gaugeVec *prometheus.GaugeVec
|
||||||
|
records []vectorRecord
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)
|
||||||
|
|
||||||
|
// VectorOperation selects how a staged value is applied to its gauge when
// the CachedGaugeVec is committed.
type VectorOperation int

const (
	// VectorOperationAdd adds the staged value to the gauge.
	VectorOperationAdd VectorOperation = iota
	// VectorOperationSet overwrites the gauge with the staged value.
	VectorOperationSet
)
|
||||||
|
|
||||||
|
type vectorRecord struct {
|
||||||
|
operation VectorOperation
|
||||||
|
value float64
|
||||||
|
labelValues []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
|
||||||
|
return &CachedGaugeVec{
|
||||||
|
gaugeVec: gaugeVec,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
|
||||||
|
v.gaugeVec.Describe(desc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
|
||||||
|
v.m.Lock()
|
||||||
|
defer v.m.Unlock()
|
||||||
|
|
||||||
|
v.gaugeVec.Collect(ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
|
||||||
|
switch operation {
|
||||||
|
case VectorOperationAdd:
|
||||||
|
case VectorOperationSet:
|
||||||
|
default:
|
||||||
|
panic("unsupported vector operation")
|
||||||
|
}
|
||||||
|
|
||||||
|
v.m.Lock()
|
||||||
|
defer v.m.Unlock()
|
||||||
|
|
||||||
|
v.records = append(v.records, vectorRecord{
|
||||||
|
operation: operation,
|
||||||
|
value: value,
|
||||||
|
labelValues: labelValues,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Commit will set the internal value as the cached value to return from "Collect()".
|
||||||
|
// The internal metric value is completely reset, so the caller should expect
|
||||||
|
// the gauge to be empty for the next 'WithLabelValues' values.
|
||||||
|
func (v *CachedGaugeVec) Commit() {
|
||||||
|
v.m.Lock()
|
||||||
|
defer v.m.Unlock()
|
||||||
|
|
||||||
|
v.gaugeVec.Reset()
|
||||||
|
for _, record := range v.records {
|
||||||
|
g := v.gaugeVec.WithLabelValues(record.labelValues...)
|
||||||
|
switch record.operation {
|
||||||
|
case VectorOperationAdd:
|
||||||
|
g.Add(record.value)
|
||||||
|
case VectorOperationSet:
|
||||||
|
g.Set(record.value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
v.records = nil
|
||||||
|
}
|
140
coderd/prometheusmetrics/collector_test.go
Normal file
140
coderd/prometheusmetrics/collector_test.go
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
package prometheusmetrics_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sort"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
dto "github.com/prometheus/client_model/go"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/coder/coder/coderd/prometheusmetrics"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCollector_Add(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// given
|
||||||
|
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "up",
|
||||||
|
Help: "The number of active agents per workspace.",
|
||||||
|
}, []string{"username", "workspace_name"}))
|
||||||
|
|
||||||
|
// when
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace")
|
||||||
|
agentsGauge.Commit()
|
||||||
|
|
||||||
|
// then
|
||||||
|
ch := make(chan prometheus.Metric, 2)
|
||||||
|
agentsGauge.Collect(ch)
|
||||||
|
|
||||||
|
metrics := collectAndSortMetrics(t, agentsGauge, 2)
|
||||||
|
|
||||||
|
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
|
||||||
|
|
||||||
|
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_Set(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// given
|
||||||
|
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "up",
|
||||||
|
Help: "The number of active agents per workspace.",
|
||||||
|
}, []string{"username", "workspace_name"}))
|
||||||
|
|
||||||
|
// when
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace")
|
||||||
|
agentsGauge.Commit()
|
||||||
|
|
||||||
|
// then
|
||||||
|
ch := make(chan prometheus.Metric, 2)
|
||||||
|
agentsGauge.Collect(ch)
|
||||||
|
|
||||||
|
metrics := collectAndSortMetrics(t, agentsGauge, 2)
|
||||||
|
|
||||||
|
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value
|
||||||
|
|
||||||
|
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_Set_Add(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// given
|
||||||
|
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "up",
|
||||||
|
Help: "The number of active agents per workspace.",
|
||||||
|
}, []string{"username", "workspace_name"}))
|
||||||
|
|
||||||
|
// when
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
|
||||||
|
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
|
||||||
|
agentsGauge.Commit()
|
||||||
|
|
||||||
|
// then
|
||||||
|
ch := make(chan prometheus.Metric, 2)
|
||||||
|
agentsGauge.Collect(ch)
|
||||||
|
|
||||||
|
metrics := collectAndSortMetrics(t, agentsGauge, 2)
|
||||||
|
|
||||||
|
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
|
||||||
|
|
||||||
|
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
|
||||||
|
ch := make(chan prometheus.Metric, count)
|
||||||
|
defer close(ch)
|
||||||
|
|
||||||
|
var metrics []dto.Metric
|
||||||
|
|
||||||
|
collector.Collect(ch)
|
||||||
|
for i := 0; i < count; i++ {
|
||||||
|
m := <-ch
|
||||||
|
|
||||||
|
var metric dto.Metric
|
||||||
|
err := m.Write(&metric)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
metrics = append(metrics, metric)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure always the same order of metrics
|
||||||
|
sort.Slice(metrics, func(i, j int) bool {
|
||||||
|
return sort.StringsAreSorted([]string{metrics[i].Label[0].GetValue(), metrics[j].Label[1].GetValue()})
|
||||||
|
})
|
||||||
|
return metrics
|
||||||
|
}
|
@ -2,13 +2,24 @@ package prometheusmetrics
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"tailscale.com/tailcfg"
|
||||||
|
|
||||||
|
"cdr.dev/slog"
|
||||||
|
|
||||||
"github.com/coder/coder/coderd"
|
"github.com/coder/coder/coderd"
|
||||||
"github.com/coder/coder/coderd/database"
|
"github.com/coder/coder/coderd/database"
|
||||||
|
"github.com/coder/coder/coderd/database/dbauthz"
|
||||||
|
"github.com/coder/coder/tailnet"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ActiveUsers tracks the number of users that have authenticated within the past hour.
|
// ActiveUsers tracks the number of users that have authenticated within the past hour.
|
||||||
@ -106,3 +117,175 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
|
|||||||
}()
|
}()
|
||||||
return cancelFunc, nil
|
return cancelFunc, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Agents tracks the total number of workspaces with labels on status.
|
||||||
|
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) {
|
||||||
|
if duration == 0 {
|
||||||
|
duration = 1 * time.Minute
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "up",
|
||||||
|
Help: "The number of active agents per workspace.",
|
||||||
|
}, []string{"username", "workspace_name"}))
|
||||||
|
err := registerer.Register(agentsGauge)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "connections",
|
||||||
|
Help: "Agent connections with statuses.",
|
||||||
|
}, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}))
|
||||||
|
err = registerer.Register(agentsConnectionsGauge)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "connection_latencies_seconds",
|
||||||
|
Help: "Agent connection latencies in seconds.",
|
||||||
|
}, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}))
|
||||||
|
err = registerer.Register(agentsConnectionLatenciesGauge)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agents",
|
||||||
|
Name: "apps",
|
||||||
|
Help: "Agent applications with statuses.",
|
||||||
|
}, []string{"agent_name", "username", "workspace_name", "app_name", "health"}))
|
||||||
|
err = registerer.Register(agentsAppsGauge)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "prometheusmetrics",
|
||||||
|
Name: "agents_execution_seconds",
|
||||||
|
Help: "Histogram for duration of agents metrics collection in seconds.",
|
||||||
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
||||||
|
})
|
||||||
|
err = registerer.Register(metricsCollectorAgents)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// nolint:gocritic // Prometheus must collect metrics for all Coder users.
|
||||||
|
ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx))
|
||||||
|
ticker := time.NewTicker(duration)
|
||||||
|
go func() {
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Debug(ctx, "Agent metrics collection is starting")
|
||||||
|
timer := prometheus.NewTimer(metricsCollectorAgents)
|
||||||
|
|
||||||
|
workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
|
||||||
|
AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
logger.Error(ctx, "can't get workspace rows", slog.Error(err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, workspace := range workspaceRows {
|
||||||
|
user, err := db.GetUserByID(ctx, workspace.OwnerID)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err))
|
||||||
|
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
|
||||||
|
if err != nil {
|
||||||
|
logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err))
|
||||||
|
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(agents) == 0 {
|
||||||
|
logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID))
|
||||||
|
agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, agent := range agents {
|
||||||
|
// Collect information about agents
|
||||||
|
agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name)
|
||||||
|
|
||||||
|
connectionStatus := agent.Status(agentInactiveDisconnectTimeout)
|
||||||
|
node := (*coordinator.Load()).Node(agent.ID)
|
||||||
|
|
||||||
|
tailnetNode := "unknown"
|
||||||
|
if node != nil {
|
||||||
|
tailnetNode = node.ID.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode)
|
||||||
|
|
||||||
|
if node == nil {
|
||||||
|
logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID))
|
||||||
|
} else {
|
||||||
|
// Collect information about connection latencies
|
||||||
|
for rawRegion, latency := range node.DERPLatency {
|
||||||
|
regionParts := strings.SplitN(rawRegion, "-", 2)
|
||||||
|
regionID, err := strconv.Atoi(regionParts[0])
|
||||||
|
if err != nil {
|
||||||
|
logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", rawRegion), slog.Error(err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
region, found := derpMap.Regions[regionID]
|
||||||
|
if !found {
|
||||||
|
// It's possible that a workspace agent is using an old DERPMap
|
||||||
|
// and reports regions that do not exist. If that's the case,
|
||||||
|
// report the region as unknown!
|
||||||
|
region = &tailcfg.DERPRegion{
|
||||||
|
RegionID: regionID,
|
||||||
|
RegionName: fmt.Sprintf("Unnamed %d", regionID),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect information about registered applications
|
||||||
|
apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID)
|
||||||
|
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||||
|
logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, app := range apps {
|
||||||
|
agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
agentsGauge.Commit()
|
||||||
|
agentsConnectionsGauge.Commit()
|
||||||
|
agentsConnectionLatenciesGauge.Commit()
|
||||||
|
agentsAppsGauge.Commit()
|
||||||
|
|
||||||
|
logger.Debug(ctx, "Agent metrics collection is done")
|
||||||
|
metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return cancelFunc, nil
|
||||||
|
}
|
||||||
|
@ -3,6 +3,7 @@ package prometheusmetrics_test
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"database/sql"
|
"database/sql"
|
||||||
|
"sync/atomic"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -11,11 +12,18 @@ import (
|
|||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"cdr.dev/slog/sloggers/slogtest"
|
||||||
|
|
||||||
|
"github.com/coder/coder/coderd/coderdtest"
|
||||||
"github.com/coder/coder/coderd/database"
|
"github.com/coder/coder/coderd/database"
|
||||||
"github.com/coder/coder/coderd/database/dbfake"
|
"github.com/coder/coder/coderd/database/dbfake"
|
||||||
"github.com/coder/coder/coderd/database/dbgen"
|
"github.com/coder/coder/coderd/database/dbgen"
|
||||||
"github.com/coder/coder/coderd/prometheusmetrics"
|
"github.com/coder/coder/coderd/prometheusmetrics"
|
||||||
"github.com/coder/coder/codersdk"
|
"github.com/coder/coder/codersdk"
|
||||||
|
"github.com/coder/coder/provisioner/echo"
|
||||||
|
"github.com/coder/coder/provisionersdk/proto"
|
||||||
|
"github.com/coder/coder/tailnet"
|
||||||
|
"github.com/coder/coder/tailnet/tailnettest"
|
||||||
"github.com/coder/coder/testutil"
|
"github.com/coder/coder/testutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgents(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Build a sample workspace with test agent and fake application
|
||||||
|
client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
|
||||||
|
db := api.Database
|
||||||
|
|
||||||
|
user := coderdtest.CreateFirstUser(t, client)
|
||||||
|
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
|
||||||
|
Parse: echo.ParseComplete,
|
||||||
|
ProvisionPlan: echo.ProvisionComplete,
|
||||||
|
ProvisionApply: []*proto.Provision_Response{{
|
||||||
|
Type: &proto.Provision_Response_Complete{
|
||||||
|
Complete: &proto.Provision_Complete{
|
||||||
|
Resources: []*proto.Resource{{
|
||||||
|
Name: "example",
|
||||||
|
Type: "aws_instance",
|
||||||
|
Agents: []*proto.Agent{{
|
||||||
|
Id: uuid.NewString(),
|
||||||
|
Name: "testagent",
|
||||||
|
Directory: t.TempDir(),
|
||||||
|
Auth: &proto.Agent_Token{
|
||||||
|
Token: uuid.NewString(),
|
||||||
|
},
|
||||||
|
Apps: []*proto.App{
|
||||||
|
{
|
||||||
|
Slug: "fake-app",
|
||||||
|
DisplayName: "Fake application",
|
||||||
|
SharingLevel: proto.AppSharingLevel_OWNER,
|
||||||
|
// Hopefully this IP and port doesn't exist.
|
||||||
|
Url: "http://127.1.0.1:65535",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}},
|
||||||
|
}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}},
|
||||||
|
})
|
||||||
|
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
|
||||||
|
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
|
||||||
|
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
|
||||||
|
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)
|
||||||
|
|
||||||
|
// given
|
||||||
|
coordinator := tailnet.NewCoordinator()
|
||||||
|
coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{}
|
||||||
|
coordinatorPtr.Store(&coordinator)
|
||||||
|
derpMap := tailnettest.RunDERPAndSTUN(t)
|
||||||
|
agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests
|
||||||
|
registry := prometheus.NewRegistry()
|
||||||
|
|
||||||
|
// when
|
||||||
|
cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond)
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
// then
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var agentsUp bool
|
||||||
|
var agentsConnections bool
|
||||||
|
var agentsApps bool
|
||||||
|
var agentsExecutionInSeconds bool
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
metrics, err := registry.Gather()
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
if len(metrics) < 1 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, metric := range metrics {
|
||||||
|
switch metric.GetName() {
|
||||||
|
case "coderd_agents_up":
|
||||||
|
assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username
|
||||||
|
assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||||
|
agentsUp = true
|
||||||
|
case "coderd_agents_connections":
|
||||||
|
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
|
||||||
|
assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state
|
||||||
|
assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status
|
||||||
|
assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node
|
||||||
|
assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username
|
||||||
|
assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||||
|
agentsConnections = true
|
||||||
|
case "coderd_agents_apps":
|
||||||
|
assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name
|
||||||
|
assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name
|
||||||
|
assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health
|
||||||
|
assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username
|
||||||
|
assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name
|
||||||
|
assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value
|
||||||
|
agentsApps = true
|
||||||
|
case "coderd_prometheusmetrics_agents_execution_seconds":
|
||||||
|
agentsExecutionInSeconds = true
|
||||||
|
default:
|
||||||
|
require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds
|
||||||
|
}, testutil.WaitShort, testutil.IntervalFast)
|
||||||
|
}
|
||||||
|
@ -30,7 +30,11 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically
|
|||||||
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
|
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
|
||||||
|
|
||||||
| Name | Type | Description | Labels |
|
| Name | Type | Description | Labels |
|
||||||
| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
|
| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
|
||||||
|
| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
|
||||||
|
| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` |
|
||||||
|
| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
|
||||||
|
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` |
|
||||||
| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | |
|
| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | |
|
||||||
| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | |
|
| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | |
|
||||||
| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | |
|
| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | |
|
||||||
@ -38,6 +42,7 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically
|
|||||||
| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` |
|
| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` |
|
||||||
| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` |
|
| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` |
|
||||||
| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` |
|
| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` |
|
||||||
|
| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | |
|
||||||
| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` |
|
| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` |
|
||||||
| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` |
|
| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` |
|
||||||
| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` |
|
| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` |
|
||||||
|
@ -1,3 +1,23 @@
|
|||||||
|
# HELP coderd_agents_apps Agent applications with statuses.
|
||||||
|
# TYPE coderd_agents_apps gauge
|
||||||
|
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1
|
||||||
|
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1
|
||||||
|
coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1
|
||||||
|
# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds.
|
||||||
|
# TYPE coderd_agents_connection_latencies_seconds gauge
|
||||||
|
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125
|
||||||
|
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416
|
||||||
|
coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416
|
||||||
|
# HELP coderd_agents_connections Agent connections with statuses.
|
||||||
|
# TYPE coderd_agents_connections gauge
|
||||||
|
coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1
|
||||||
|
coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1
|
||||||
|
coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1
|
||||||
|
# HELP coderd_agents_up The number of active agents per workspace.
|
||||||
|
# TYPE coderd_agents_up gauge
|
||||||
|
coderd_agents_up{username="admin",workspace_name="workspace-1"} 1
|
||||||
|
coderd_agents_up{username="admin",workspace_name="workspace-2"} 1
|
||||||
|
coderd_agents_up{username="admin",workspace_name="workspace-3"} 1
|
||||||
# HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds.
|
# HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds.
|
||||||
# TYPE coderd_api_websocket_durations_seconds histogram
|
# TYPE coderd_api_websocket_durations_seconds histogram
|
||||||
coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0
|
coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0
|
||||||
@ -568,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files
|
|||||||
# HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status.
|
# HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status.
|
||||||
# TYPE coderd_api_workspace_latest_build_total gauge
|
# TYPE coderd_api_workspace_latest_build_total gauge
|
||||||
coderd_api_workspace_latest_build_total{status="succeeded"} 1
|
coderd_api_workspace_latest_build_total{status="succeeded"} 1
|
||||||
|
# HELP coderd_metrics_collector_agents_execution_seconds Histogram for duration of agents metrics collection in seconds.
|
||||||
|
# TYPE coderd_metrics_collector_agents_execution_seconds histogram
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.001"} 0
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.005"} 0
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.01"} 0
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.025"} 0
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.05"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.1"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="0.5"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="1"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="5"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="10"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="30"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_bucket{le="+Inf"} 2
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_sum 0.0592915
|
||||||
|
coderd_metrics_collector_agents_execution_seconds_count 2
|
||||||
# HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds.
|
# HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds.
|
||||||
# TYPE coderd_provisionerd_job_timings_seconds histogram
|
# TYPE coderd_provisionerd_job_timings_seconds histogram
|
||||||
coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0
|
coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0
|
||||||
|
Reference in New Issue
Block a user