mirror of
https://github.com/coder/coder.git
synced 2025-03-15 19:19:58 +00:00
feat: expose current agent connections by type via prometheus (#14612)
This commit is contained in:
@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
||||
var mu sync.Mutex
|
||||
status := a.network.Status()
|
||||
durations := []float64{}
|
||||
p2pConns := 0
|
||||
derpConns := 0
|
||||
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancelFunc()
|
||||
for nodeID, peer := range status.Peer {
|
||||
@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
|
||||
duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
durations = append(durations, float64(duration.Microseconds()))
|
||||
if p2p {
|
||||
p2pConns++
|
||||
} else {
|
||||
derpConns++
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
||||
// Agent metrics are changing all the time, so there is no need to perform
|
||||
// reflect.DeepEqual to see if stats should be transferred.
|
||||
|
||||
// currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time.
|
||||
a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns))
|
||||
a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns))
|
||||
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancelFunc()
|
||||
a.logger.Debug(ctx, "collecting agent metrics for stats")
|
||||
|
@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
||||
err = session.Shell()
|
||||
require.NoError(t, err)
|
||||
|
||||
expected := []agentsdk.AgentMetric{
|
||||
expected := []*proto.Stats_Metric{
|
||||
{
|
||||
Name: "agent_reconnecting_pty_connections_total",
|
||||
Type: agentsdk.AgentMetricTypeCounter,
|
||||
Type: proto.Stats_Metric_COUNTER,
|
||||
Value: 0,
|
||||
},
|
||||
{
|
||||
Name: "agent_sessions_total",
|
||||
Type: agentsdk.AgentMetricTypeCounter,
|
||||
Type: proto.Stats_Metric_COUNTER,
|
||||
Value: 1,
|
||||
Labels: []agentsdk.AgentMetricLabel{
|
||||
Labels: []*proto.Stats_Metric_Label{
|
||||
{
|
||||
Name: "magic_type",
|
||||
Value: "ssh",
|
||||
@ -2554,30 +2554,46 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
||||
},
|
||||
{
|
||||
Name: "agent_ssh_server_failed_connections_total",
|
||||
Type: agentsdk.AgentMetricTypeCounter,
|
||||
Type: proto.Stats_Metric_COUNTER,
|
||||
Value: 0,
|
||||
},
|
||||
{
|
||||
Name: "agent_ssh_server_sftp_connections_total",
|
||||
Type: agentsdk.AgentMetricTypeCounter,
|
||||
Type: proto.Stats_Metric_COUNTER,
|
||||
Value: 0,
|
||||
},
|
||||
{
|
||||
Name: "agent_ssh_server_sftp_server_errors_total",
|
||||
Type: agentsdk.AgentMetricTypeCounter,
|
||||
Type: proto.Stats_Metric_COUNTER,
|
||||
Value: 0,
|
||||
},
|
||||
{
|
||||
Name: "coderd_agentstats_startup_script_seconds",
|
||||
Type: agentsdk.AgentMetricTypeGauge,
|
||||
Name: "coderd_agentstats_currently_reachable_peers",
|
||||
Type: proto.Stats_Metric_GAUGE,
|
||||
Value: 0,
|
||||
Labels: []agentsdk.AgentMetricLabel{
|
||||
Labels: []*proto.Stats_Metric_Label{
|
||||
{
|
||||
Name: "success",
|
||||
Value: "true",
|
||||
Name: "connection_type",
|
||||
Value: "derp",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "coderd_agentstats_currently_reachable_peers",
|
||||
Type: proto.Stats_Metric_GAUGE,
|
||||
Value: 1,
|
||||
Labels: []*proto.Stats_Metric_Label{
|
||||
{
|
||||
Name: "connection_type",
|
||||
Value: "p2p",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "coderd_agentstats_startup_script_seconds",
|
||||
Type: proto.Stats_Metric_GAUGE,
|
||||
Value: 1,
|
||||
},
|
||||
}
|
||||
|
||||
var actual []*promgo.MetricFamily
|
||||
@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(expected) != len(actual) {
|
||||
return false
|
||||
count := 0
|
||||
for _, m := range actual {
|
||||
count += len(m.GetMetric())
|
||||
}
|
||||
|
||||
return verifyCollectedMetrics(t, expected, actual)
|
||||
return count == len(expected)
|
||||
}, testutil.WaitLong, testutil.IntervalFast)
|
||||
|
||||
require.Len(t, actual, len(expected))
|
||||
collected := verifyCollectedMetrics(t, expected, actual)
|
||||
require.True(t, collected, "expected metrics were not collected")
|
||||
i := 0
|
||||
for _, mf := range actual {
|
||||
for _, m := range mf.GetMetric() {
|
||||
assert.Equal(t, expected[i].Name, mf.GetName())
|
||||
assert.Equal(t, expected[i].Type.String(), mf.GetType().String())
|
||||
// Value is max expected
|
||||
if expected[i].Type == proto.Stats_Metric_GAUGE {
|
||||
assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue())
|
||||
} else if expected[i].Type == proto.Stats_Metric_COUNTER {
|
||||
assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue())
|
||||
}
|
||||
for j, lbl := range expected[i].Labels {
|
||||
assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{
|
||||
Name: &lbl.Name,
|
||||
Value: &lbl.Value,
|
||||
})
|
||||
}
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
_ = stdin.Close()
|
||||
err = session.Wait()
|
||||
@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
|
||||
t.Helper()
|
||||
|
||||
for i, e := range expected {
|
||||
assert.Equal(t, e.Name, actual[i].GetName())
|
||||
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))
|
||||
|
||||
for _, m := range actual[i].GetMetric() {
|
||||
assert.Equal(t, e.Value, m.Counter.GetValue())
|
||||
|
||||
if len(m.GetLabel()) > 0 {
|
||||
for j, lbl := range m.GetLabel() {
|
||||
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
|
||||
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
|
||||
}
|
||||
}
|
||||
m.GetLabel()
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
type syncWriter struct {
|
||||
mu sync.Mutex
|
||||
w io.Writer
|
||||
|
@ -19,6 +19,7 @@ type agentMetrics struct {
|
||||
// startupScriptSeconds is the time in seconds that the start script(s)
|
||||
// took to run. This is reported once per agent.
|
||||
startupScriptSeconds *prometheus.GaugeVec
|
||||
currentConnections *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
|
||||
@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
|
||||
}, []string{"success"})
|
||||
registerer.MustRegister(startupScriptSeconds)
|
||||
|
||||
currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "coderd",
|
||||
Subsystem: "agentstats",
|
||||
Name: "currently_reachable_peers",
|
||||
Help: "The number of peers (e.g. clients) that are currently reachable over the encrypted network.",
|
||||
}, []string{"connection_type"})
|
||||
registerer.MustRegister(currentConnections)
|
||||
|
||||
return &agentMetrics{
|
||||
connectionsTotal: connectionsTotal,
|
||||
reconnectingPTYErrors: reconnectingPTYErrors,
|
||||
startupScriptSeconds: startupScriptSeconds,
|
||||
currentConnections: currentConnections,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,6 +96,11 @@ spec:
|
||||
|
||||
## Available metrics
|
||||
|
||||
`coderd_agentstats_*` metrics must first be enabled with the flag
`--prometheus-collect-agent-stats` or the environment variable
`CODER_PROMETHEUS_COLLECT_AGENT_STATS` before they can be retrieved from the
deployment. They are always available directly from the agent.
|
||||
|
||||
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
|
||||
|
||||
| Name | Type | Description | Labels |
|
||||
@ -107,6 +112,7 @@ spec:
|
||||
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
|
||||
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |
|
||||
|
@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_
|
||||
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
|
||||
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
|
||||
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
|
||||
# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network.
|
||||
# TYPE coderd_agentstats_currently_reachable_peers gauge
|
||||
coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0
|
||||
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
|
||||
# TYPE coderd_agentstats_rx_bytes gauge
|
||||
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731
|
||||
|
Reference in New Issue
Block a user