1
0
mirror of https://github.com/coder/coder.git synced 2025-03-15 19:19:58 +00:00

feat: expose current agent connections by type via prometheus ()

This commit is contained in:
Ethan
2024-09-11 14:13:30 +10:00
committed by GitHub
parent 40688e40df
commit c8580a415a
5 changed files with 82 additions and 43 deletions
agent
docs/admin
scripts/metricsdocgen

@ -1510,6 +1510,8 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
var mu sync.Mutex
status := a.network.Status()
durations := []float64{}
p2pConns := 0
derpConns := 0
pingCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
defer cancelFunc()
for nodeID, peer := range status.Peer {
@ -1526,13 +1528,18 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
wg.Add(1)
go func() {
defer wg.Done()
duration, _, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
duration, p2p, _, err := a.network.Ping(pingCtx, addresses[0].Addr())
if err != nil {
return
}
mu.Lock()
defer mu.Unlock()
durations = append(durations, float64(duration.Microseconds()))
if p2p {
p2pConns++
} else {
derpConns++
}
}()
}
wg.Wait()
@ -1552,6 +1559,9 @@ func (a *agent) Collect(ctx context.Context, networkStats map[netlogtype.Connect
// Agent metrics are changing all the time, so there is no need to perform
// reflect.DeepEqual to see if stats should be transferred.
// currentConnections behaves like a hypothetical `GaugeFuncVec` and is only set at collection time.
a.metrics.currentConnections.WithLabelValues("p2p").Set(float64(p2pConns))
a.metrics.currentConnections.WithLabelValues("derp").Set(float64(derpConns))
metricsCtx, cancelFunc := context.WithTimeout(ctx, 5*time.Second)
defer cancelFunc()
a.logger.Debug(ctx, "collecting agent metrics for stats")

@ -2531,17 +2531,17 @@ func TestAgent_Metrics_SSH(t *testing.T) {
err = session.Shell()
require.NoError(t, err)
expected := []agentsdk.AgentMetric{
expected := []*proto.Stats_Metric{
{
Name: "agent_reconnecting_pty_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_sessions_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 1,
Labels: []agentsdk.AgentMetricLabel{
Labels: []*proto.Stats_Metric_Label{
{
Name: "magic_type",
Value: "ssh",
@ -2554,30 +2554,46 @@ func TestAgent_Metrics_SSH(t *testing.T) {
},
{
Name: "agent_ssh_server_failed_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_ssh_server_sftp_connections_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "agent_ssh_server_sftp_server_errors_total",
Type: agentsdk.AgentMetricTypeCounter,
Type: proto.Stats_Metric_COUNTER,
Value: 0,
},
{
Name: "coderd_agentstats_startup_script_seconds",
Type: agentsdk.AgentMetricTypeGauge,
Name: "coderd_agentstats_currently_reachable_peers",
Type: proto.Stats_Metric_GAUGE,
Value: 0,
Labels: []agentsdk.AgentMetricLabel{
Labels: []*proto.Stats_Metric_Label{
{
Name: "success",
Value: "true",
Name: "connection_type",
Value: "derp",
},
},
},
{
Name: "coderd_agentstats_currently_reachable_peers",
Type: proto.Stats_Metric_GAUGE,
Value: 1,
Labels: []*proto.Stats_Metric_Label{
{
Name: "connection_type",
Value: "p2p",
},
},
},
{
Name: "coderd_agentstats_startup_script_seconds",
Type: proto.Stats_Metric_GAUGE,
Value: 1,
},
}
var actual []*promgo.MetricFamily
@ -2586,17 +2602,33 @@ func TestAgent_Metrics_SSH(t *testing.T) {
if err != nil {
return false
}
if len(expected) != len(actual) {
return false
count := 0
for _, m := range actual {
count += len(m.GetMetric())
}
return verifyCollectedMetrics(t, expected, actual)
return count == len(expected)
}, testutil.WaitLong, testutil.IntervalFast)
require.Len(t, actual, len(expected))
collected := verifyCollectedMetrics(t, expected, actual)
require.True(t, collected, "expected metrics were not collected")
i := 0
for _, mf := range actual {
for _, m := range mf.GetMetric() {
assert.Equal(t, expected[i].Name, mf.GetName())
assert.Equal(t, expected[i].Type.String(), mf.GetType().String())
// Value is max expected
if expected[i].Type == proto.Stats_Metric_GAUGE {
assert.GreaterOrEqualf(t, expected[i].Value, m.GetGauge().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetGauge().GetValue())
} else if expected[i].Type == proto.Stats_Metric_COUNTER {
assert.GreaterOrEqualf(t, expected[i].Value, m.GetCounter().GetValue(), "expected %s to be greater than or equal to %f, got %f", expected[i].Name, expected[i].Value, m.GetCounter().GetValue())
}
for j, lbl := range expected[i].Labels {
assert.Equal(t, m.GetLabel()[j], &promgo.LabelPair{
Name: &lbl.Name,
Value: &lbl.Value,
})
}
i++
}
}
_ = stdin.Close()
err = session.Wait()
@ -2828,28 +2860,6 @@ func TestAgent_ManageProcessPriority(t *testing.T) {
})
}
func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actual []*promgo.MetricFamily) bool {
t.Helper()
for i, e := range expected {
assert.Equal(t, e.Name, actual[i].GetName())
assert.Equal(t, string(e.Type), strings.ToLower(actual[i].GetType().String()))
for _, m := range actual[i].GetMetric() {
assert.Equal(t, e.Value, m.Counter.GetValue())
if len(m.GetLabel()) > 0 {
for j, lbl := range m.GetLabel() {
assert.Equal(t, e.Labels[j].Name, lbl.GetName())
assert.Equal(t, e.Labels[j].Value, lbl.GetValue())
}
}
m.GetLabel()
}
}
return true
}
type syncWriter struct {
mu sync.Mutex
w io.Writer

@ -19,6 +19,7 @@ type agentMetrics struct {
// startupScriptSeconds is the time in seconds that the start script(s)
// took to run. This is reported once per agent.
startupScriptSeconds *prometheus.GaugeVec
currentConnections *prometheus.GaugeVec
}
func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
@ -45,10 +46,19 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics {
}, []string{"success"})
registerer.MustRegister(startupScriptSeconds)
currentConnections := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "agentstats",
Name: "currently_reachable_peers",
Help: "The number of peers (e.g. clients) that are currently reachable over the encrypted network.",
}, []string{"connection_type"})
registerer.MustRegister(currentConnections)
return &agentMetrics{
connectionsTotal: connectionsTotal,
reconnectingPTYErrors: reconnectingPTYErrors,
startupScriptSeconds: startupScriptSeconds,
currentConnections: currentConnections,
}
}

@ -96,6 +96,11 @@ spec:
## Available metrics
`coderd_agentstats_*` metrics must first be enabled with the flag
`--prometheus-collect-agent-stats`, or the environment variable
`CODER_PROMETHEUS_COLLECT_AGENT_STATS` before they can be retrieved from the
deployment. They will always be available from the agent.
<!-- Code generated by 'make docs/admin/prometheus.md'. DO NOT EDIT -->
| Name | Type | Description | Labels |
@ -107,6 +112,7 @@ spec:
| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` |
| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_currently_reachable_peers` | gauge | The number of peers (e.g. clients) that are currently reachable over the encrypted network. | `agent_name` `connection_type` `template_name` `username` `workspace_name` |
| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` |
| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` |

@ -63,6 +63,9 @@ coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_
# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency
# TYPE coderd_agentstats_connection_median_latency_seconds gauge
coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784
# HELP coderd_agentstats_currently_reachable_peers The number of peers (e.g. clients) that are currently reachable over the encrypted network.
# TYPE coderd_agentstats_currently_reachable_peers gauge
coderd_agentstats_currently_reachable_peers{agent_name="main",connection_type="derp",template_name="docker",username="admin",workspace_name="workspace1"} 0
# HELP coderd_agentstats_rx_bytes Agent Rx bytes
# TYPE coderd_agentstats_rx_bytes gauge
coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731