mirror of
https://github.com/coder/coder.git
synced 2025-07-13 21:36:50 +00:00
feat: expose current agent connections by type via prometheus (#14612)
This commit is contained in:
@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
|||||||
var mu sync.Mutex
|
var mu sync.Mutex
|
||||||
status := a.network.Status()
|
status := a.network.Status()
|
||||||
durations := []float64{}
|
durations := []float64{}
|
||||||
|
p2pConns := 0
|
||||||
|
derpConns := 0
|
||||||
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
||||||
defer cancelFunc()
|
defer cancelFunc()
|
||||||
for nodeID, peer := range status.Peer {
|
for nodeID, peer := range status.Peer {
|
||||||
@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
|
duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
defer mu.Unlock()
|
defer mu.Unlock()
|
||||||
durations = append(durations, float64(duration.Microseconds()))
|
durations = append(durations, float64(duration.Microseconds()))
|
||||||
|
if p2p {
|
||||||
|
p2pConns++
|
||||||
|
} else {
|
||||||
|
derpConns++
|
||||||
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
|
|||||||
// Agent metrics are changing all the time, so there is no need to perform
|
// Agent metrics are changing all the time, so there is no need to perform
|
||||||
// reflect.DeepEqual to see if stats should be transferred.
|
// reflect.DeepEqual to see if stats should be transferred.
|
||||||
|
|
||||||
|
// currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time.
|
||||||
|
a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns))
|
||||||
|
a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns))
|
||||||
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
|
||||||
defer cancelFunc()
|
defer cancelFunc()
|
||||||
a.logger.Debug(ctx, "collecting agent metrics for stats")
|
a.logger.Debug(ctx, "collecting agent metrics for stats")
|
||||||
|
@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
|||||||
err = session.Shell()
|
err = session.Shell()
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
expected := []agentsdk.AgentMetric{
|
expected := []*proto.Stats_Metric{
|
||||||
{
|
{
|
||||||
Name: "agent_reconnecting_pty_connections_total",
|
Name: "agent_reconnecting_pty_connections_total",
|
||||||
Type: agentsdk.AgentMetricTypeCounter,
|
Type: proto.Stats_Metric_COUNTER,
|
||||||
Value: 0,
|
Value: 0,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "agent_sessions_total",
|
Name: "agent_sessions_total",
|
||||||
Type: agentsdk.AgentMetricTypeCounter,
|
Type: proto.Stats_Metric_COUNTER,
|
||||||
Value: 1,
|
Value: 1,
|
||||||
Labels: []agentsdk.AgentMetricLabel{
|
Labels: []*proto.Stats_Metric_Label{
|
||||||
{
|
{
|
||||||
Name: "magic_type",
|
Name: "magic_type",
|
||||||
Value: "ssh",
|
Value: "ssh",
|
||||||
@ -2554,29 +2554,45 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "agent_ssh_server_failed_connections_total",
|
Name: "agent_ssh_server_failed_connections_total",
|
||||||
Type: agentsdk.AgentMetricTypeCounter,
|
Type: proto.Stats_Metric_COUNTER,
|
||||||
Value: 0,
|
Value: 0,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "agent_ssh_server_sftp_connections_total",
|
Name: "agent_ssh_server_sftp_connections_total",
|
||||||
Type: agentsdk.AgentMetricTypeCounter,
|
Type: proto.Stats_Metric_COUNTER,
|
||||||
Value: 0,
|
Value: 0,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "agent_ssh_server_sftp_server_errors_total",
|
Name: "agent_ssh_server_sftp_server_errors_total",
|
||||||
Type: agentsdk.AgentMetricTypeCounter,
|
Type: proto.Stats_Metric_COUNTER,
|
||||||
Value: 0,
|
Value: 0,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Name: "coderd_agentstats_currently_reachable_peers",
|
||||||
|
Type: proto.Stats_Metric_GAUGE,
|
||||||
|
Value: 0,
|
||||||
|
Labels: []*proto.Stats_Metric_Label{
|
||||||
|
{
|
||||||
|
Name: "connection_type",
|
||||||
|
Value: "derp",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "coderd_agentstats_currently_reachable_peers",
|
||||||
|
Type: proto.Stats_Metric_GAUGE,
|
||||||
|
Value: 1,
|
||||||
|
Labels: []*proto.Stats_Metric_Label{
|
||||||
|
{
|
||||||
|
Name: "connection_type",
|
||||||
|
Value: "p2p",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "coderd_agentstats_startup_script_seconds",
|
Name: "coderd_agentstats_startup_script_seconds",
|
||||||
Type: agentsdk.AgentMetricTypeGauge,
|
Type: proto.Stats_Metric_GAUGE,
|
||||||
Value: 0,
|
Value: 1,
|
||||||
Labels: []agentsdk.AgentMetricLabel{
|
|
||||||
{
|
|
||||||
Name: "success",
|
|
||||||
Value: "true",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
count := 0
|
||||||
if len(expected) != len(actual) {
|
for _, m := range actual {
|
||||||
return false
|
count += len(m.GetMetric())
|
||||||
}
|
}
|
||||||
|
return count == len(expected)
|
||||||
return verifyCollectedMetrics(t, expected, actual)
|
|
||||||
}, testutil.WaitLong, testutil.IntervalFast)
|
}, testutil.WaitLong, testutil.IntervalFast)
|
||||||
|
|
||||||
require.Len(t, actual, len(expected))
|
i := 0
|
||||||
collected := verifyCollectedMetrics(t, expected, actual)
|
for _, mf := range actual {
|
||||||
require.True(t, collected, "expected metrics were not collected")
|
for _, m := range mf.GetMetric() {
|
||||||
|
assert.Equal(t, expected[i].Name, mf.GetName())
|
||||||
|
assert.Equal(t, expected[i].Type.String(), mf.GetType().String())
|
||||||
|
// The expected value is an upper bound: the collected value must not exceed it.
|
||||||
|
if expected[i].Type == proto.Stats_Metric_GAUGE {
|
||||||
|
assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue())
|
||||||
|
} else if expected[i].Type == proto.Stats_Metric_COUNTER {
|
||||||
|
assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue())
|
||||||
|
}
|
||||||
|
for j, lbl := range expected[i].Labels {
|
||||||
|
assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{
|
||||||
|
Name: &lbl.Name,
|
||||||
|
Value: &lbl.Value,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
_ = stdin.Close()
|
_ = stdin.Close()
|
||||||
err = session.Wait()
|
err = session.Wait()
|
||||||
@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// verifyCollectedMetrics checks the gathered metric families against the
// expected agent metrics, pairing expected[i] with actual[i] by position.
//
// For each pair it asserts that the names match and that the expected type
// equals the lower-cased string form of the family's type. For every metric
// in the family it then asserts the expected value and, positionally, the
// label names and values.
//
// Mismatches are reported through assert on t; the results of those
// assertions are not inspected, so the function returns true in every case.
// actual is indexed without a length check here, so callers need to ensure
// it has at least len(expected) entries.
func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
|
|
||||||
t.Helper()
|
|
||||||
|
|
||||||
for i, e := range expected {
|
|
||||||
assert.Equal(t, e.Name, actual[i].GetName())
|
|
||||||
// e.Type is compared as a string against the lower-cased family type name.
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))
|
|
||||||
|
|
||||||
for _, m := range actual[i].GetMetric() {
|
|
||||||
// NOTE(review): the value is always read from m.Counter, whatever the
// family type is — confirm this is intended for gauge metrics.
assert.Equal(t, e.Value, m.Counter.GetValue())
|
|
||||||
|
|
||||||
if len(m.GetLabel()) > 0 {
|
|
||||||
// Labels are matched by index; e.Labels must be at least as long as
// the metric's label list or the indexing below goes out of range.
for j, lbl := range m.GetLabel() {
|
|
||||||
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
|
|
||||||
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// NOTE(review): the result of this call is discarded; it looks like a
// leftover statement with no effect — confirm before removing.
m.GetLabel()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
type syncWriter struct {
|
type syncWriter struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
w io.Writer
|
w io.Writer
|
||||||
|
@ -19,6 +19,7 @@ type agentMetrics struct {
|
|||||||
// startupScriptSeconds is the time in seconds that the start script(s)
|
// startupScriptSeconds is the time in seconds that the start script(s)
|
||||||
// took to run. This is reported once per agent.
|
// took to run. This is reported once per agent.
|
||||||
startupScriptSeconds *prometheus.GaugeVec
|
startupScriptSeconds *prometheus.GaugeVec
|
||||||
|
currentConnections *prometheus.GaugeVec
|
||||||
}
|
}
|
||||||
|
|
||||||
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
|
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
|
||||||
@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
|
|||||||
}, []string{"success"})
|
}, []string{"success"})
|
||||||
registerer.MustRegister(startupScriptSeconds)
|
registerer.MustRegister(startupScriptSeconds)
|
||||||
|
|
||||||
|
currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "coderd",
|
||||||
|
Subsystem: "agentstats",
|
||||||
|
Name: "currently_reachable_peers",
|
||||||
|
Help: "The number of peers (e.g. clients) that are currently reachable over the encrypted network.",
|
||||||
|
}, []string{"connection_type"})
|
||||||
|
registerer.MustRegister(currentConnections)
|
||||||
|
|
||||||
return &agentMetrics{
|
return &agentMetrics{
|
||||||
connectionsTotal: connectionsTotal,
|
connectionsTotal: connectionsTotal,
|
||||||
reconnectingPTYErrors: reconnectingPTYErrors,
|
reconnectingPTYErrors: reconnectingPTYErrors,
|
||||||
startupScriptSeconds: startupScriptSeconds,
|
startupScriptSeconds: startupScriptSeconds,
|
||||||
|
currentConnections: currentConnections,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,6 +96,11 @@ spec:
|
|||||||
|
|
||||||
## Available metrics
|
## Available metrics
|
||||||
|
|
||||||
|
`coderd_agentstats_*` metrics must first be enabled with the flag
|
||||||
|
`--prometheus-collect-agent-stats`, or the environment variable
|
||||||
|
`CODER_PROMETHEUS_COLLECT_AGENT_STATS` before they can be retrieved from the
|
||||||
|
deployment. They will always be available from the agent.
|
||||||
|
|
||||||
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
|
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
|
||||||
|
|
||||||
| Name | Type | Description | Labels |
|
| Name | Type | Description | Labels |
|
||||||
@ -107,6 +112,7 @@ spec:
|
|||||||
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
|
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
|
||||||
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
|
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
|
||||||
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
|
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
|
||||||
|
| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` |
|
||||||
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
|
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
|
||||||
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
|
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
|
||||||
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |
|
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |
|
||||||
|
@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_
|
|||||||
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
|
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
|
||||||
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
|
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
|
||||||
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
|
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
|
||||||
|
# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network.
|
||||||
|
# TYPE coderd_agentstats_currently_reachable_peers gauge
|
||||||
|
coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0
|
||||||
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
|
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
|
||||||
# TYPE coderd_agentstats_rx_bytes gauge
|
# TYPE coderd_agentstats_rx_bytes gauge
|
||||||
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731
|
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731
|
||||||
|
Reference in New Issue
Block a user