mirror of
https://github.com/coder/coder.git
synced 2025-07-15 22:20:27 +00:00
feat: expose agent stats via Prometheus endpoint (#7115)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * WIP * WIP * db: GetWorkspaceAgentStatsAndLabels * fmt * WIP * gauges * feat: collect * fix * fmt * minor fixes * Prometheus flag * fix * WIP * fix tests * WIP * fix json * Rx Tx bytes * CloseFunc * fix * fix * Fixes * fix * fix: IgnoreErrors * Fix: Windows * fix * reflect.DeepEquals
This commit is contained in:
@ -302,6 +302,10 @@ func (q *querier) GetWorkspaceAgentStats(ctx context.Context, createdAfter time.
|
||||
return q.db.GetWorkspaceAgentStats(ctx, createdAfter)
|
||||
}
|
||||
|
||||
func (q *querier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
|
||||
return q.db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter)
|
||||
}
|
||||
|
||||
func (q *querier) GetDeploymentWorkspaceStats(ctx context.Context) (database.GetDeploymentWorkspaceStatsRow, error) {
|
||||
return q.db.GetDeploymentWorkspaceStats(ctx)
|
||||
}
|
||||
|
@ -3998,6 +3998,77 @@ func (q *fakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) {
|
||||
q.mutex.RLock()
|
||||
defer q.mutex.RUnlock()
|
||||
|
||||
agentStatsCreatedAfter := make([]database.WorkspaceAgentStat, 0)
|
||||
latestAgentStats := map[uuid.UUID]database.WorkspaceAgentStat{}
|
||||
|
||||
for _, agentStat := range q.workspaceAgentStats {
|
||||
if agentStat.CreatedAt.After(createdAfter) {
|
||||
agentStatsCreatedAfter = append(agentStatsCreatedAfter, agentStat)
|
||||
latestAgentStats[agentStat.AgentID] = agentStat
|
||||
}
|
||||
}
|
||||
|
||||
statByAgent := map[uuid.UUID]database.GetWorkspaceAgentStatsAndLabelsRow{}
|
||||
|
||||
// Session and connection metrics
|
||||
for _, agentStat := range latestAgentStats {
|
||||
stat := statByAgent[agentStat.AgentID]
|
||||
stat.SessionCountVSCode += agentStat.SessionCountVSCode
|
||||
stat.SessionCountJetBrains += agentStat.SessionCountJetBrains
|
||||
stat.SessionCountReconnectingPTY += agentStat.SessionCountReconnectingPTY
|
||||
stat.SessionCountSSH += agentStat.SessionCountSSH
|
||||
stat.ConnectionCount += agentStat.ConnectionCount
|
||||
if agentStat.ConnectionMedianLatencyMS >= 0 && stat.ConnectionMedianLatencyMS < agentStat.ConnectionMedianLatencyMS {
|
||||
stat.ConnectionMedianLatencyMS = agentStat.ConnectionMedianLatencyMS
|
||||
}
|
||||
statByAgent[agentStat.AgentID] = stat
|
||||
}
|
||||
|
||||
// Tx, Rx metrics
|
||||
for _, agentStat := range agentStatsCreatedAfter {
|
||||
stat := statByAgent[agentStat.AgentID]
|
||||
stat.RxBytes += agentStat.RxBytes
|
||||
stat.TxBytes += agentStat.TxBytes
|
||||
statByAgent[agentStat.AgentID] = stat
|
||||
}
|
||||
|
||||
// Labels
|
||||
for _, agentStat := range agentStatsCreatedAfter {
|
||||
stat := statByAgent[agentStat.AgentID]
|
||||
|
||||
user, err := q.getUserByIDNoLock(agentStat.UserID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stat.Username = user.Username
|
||||
|
||||
workspace, err := q.GetWorkspaceByID(ctx, agentStat.WorkspaceID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stat.WorkspaceName = workspace.Name
|
||||
|
||||
agent, err := q.GetWorkspaceAgentByID(ctx, agentStat.AgentID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stat.AgentName = agent.Name
|
||||
|
||||
statByAgent[agentStat.AgentID] = stat
|
||||
}
|
||||
|
||||
stats := make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(statByAgent))
|
||||
for _, agent := range statByAgent {
|
||||
stats = append(stats, agent)
|
||||
}
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
|
||||
q.mutex.RLock()
|
||||
defer q.mutex.RUnlock()
|
||||
|
@ -130,6 +130,7 @@ type sqlcQuerier interface {
|
||||
GetWorkspaceAgentMetadata(ctx context.Context, workspaceAgentID uuid.UUID) ([]WorkspaceAgentMetadatum, error)
|
||||
GetWorkspaceAgentStartupLogsAfter(ctx context.Context, arg GetWorkspaceAgentStartupLogsAfterParams) ([]WorkspaceAgentStartupLog, error)
|
||||
GetWorkspaceAgentStats(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsRow, error)
|
||||
GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error)
|
||||
GetWorkspaceAgentsByResourceIDs(ctx context.Context, ids []uuid.UUID) ([]WorkspaceAgent, error)
|
||||
GetWorkspaceAgentsCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceAgent, error)
|
||||
GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) ([]WorkspaceAgent, error)
|
||||
|
@ -6374,6 +6374,108 @@ func (q *sqlQuerier) GetWorkspaceAgentStats(ctx context.Context, createdAt time.
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// getWorkspaceAgentStatsAndLabels is the SQL text executed by
// (*sqlQuerier).GetWorkspaceAgentStatsAndLabels. Its single parameter ($1) is
// compared against workspace_agent_stats.created_at with ">".
//
// The agent_stats CTE sums rx_bytes/tx_bytes per (user_id, agent_id,
// workspace_id). The latest_agent_stats CTE keeps, per agent_id, the newest row
// (rn = 1) among rows with connection_median_latency_ms > 0 and exposes its
// session counts, connection count and latency. The final SELECT joins both
// CTEs with users, workspace_agents and workspaces to attach the name labels.
// Because these are plain JOINs, an agent without a row in latest_agent_stats
// does not appear in the result.
const getWorkspaceAgentStatsAndLabels = `-- name: GetWorkspaceAgentStatsAndLabels :many
|
||||
WITH agent_stats AS (
|
||||
SELECT
|
||||
user_id,
|
||||
agent_id,
|
||||
workspace_id,
|
||||
coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes,
|
||||
coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes
|
||||
FROM workspace_agent_stats
|
||||
WHERE workspace_agent_stats.created_at > $1
|
||||
GROUP BY user_id, agent_id, workspace_id
|
||||
), latest_agent_stats AS (
|
||||
SELECT
|
||||
a.agent_id,
|
||||
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
|
||||
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
|
||||
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
|
||||
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
|
||||
coalesce(SUM(connection_count), 0)::bigint AS connection_count,
|
||||
coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
|
||||
FROM (
|
||||
SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
|
||||
FROM workspace_agent_stats
|
||||
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
|
||||
WHERE created_at > $1 AND connection_median_latency_ms > 0
|
||||
) AS a
|
||||
WHERE a.rn = 1
|
||||
GROUP BY a.user_id, a.agent_id, a.workspace_id
|
||||
)
|
||||
SELECT
|
||||
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes,
|
||||
session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty,
|
||||
connection_count, connection_median_latency_ms
|
||||
FROM
|
||||
agent_stats
|
||||
JOIN
|
||||
latest_agent_stats
|
||||
ON
|
||||
agent_stats.agent_id = latest_agent_stats.agent_id
|
||||
JOIN
|
||||
users
|
||||
ON
|
||||
users.id = agent_stats.user_id
|
||||
JOIN
|
||||
workspace_agents
|
||||
ON
|
||||
workspace_agents.id = agent_stats.agent_id
|
||||
JOIN
|
||||
workspaces
|
||||
ON
|
||||
workspaces.id = agent_stats.workspace_id
|
||||
`
|
||||
|
||||
// GetWorkspaceAgentStatsAndLabelsRow is one result row of the
// GetWorkspaceAgentStatsAndLabels query. Field order matches the order in
// which the query's columns are scanned.
type GetWorkspaceAgentStatsAndLabelsRow struct {
	// Name labels taken from the users, workspace_agents and workspaces tables.
	Username      string `db:"username" json:"username"`
	AgentName     string `db:"agent_name" json:"agent_name"`
	WorkspaceName string `db:"workspace_name" json:"workspace_name"`

	// Byte totals over the stats selected by the query's time filter.
	RxBytes int64 `db:"rx_bytes" json:"rx_bytes"`
	TxBytes int64 `db:"tx_bytes" json:"tx_bytes"`

	// Session, connection and latency figures from the agent's latest stat.
	SessionCountVSCode          int64   `db:"session_count_vscode" json:"session_count_vscode"`
	SessionCountSSH             int64   `db:"session_count_ssh" json:"session_count_ssh"`
	SessionCountJetBrains       int64   `db:"session_count_jetbrains" json:"session_count_jetbrains"`
	SessionCountReconnectingPTY int64   `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"`
	ConnectionCount             int64   `db:"connection_count" json:"connection_count"`
	ConnectionMedianLatencyMS   float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"`
}
|
||||
|
||||
func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error) {
|
||||
rows, err := q.db.QueryContext(ctx, getWorkspaceAgentStatsAndLabels, createdAt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []GetWorkspaceAgentStatsAndLabelsRow
|
||||
for rows.Next() {
|
||||
var i GetWorkspaceAgentStatsAndLabelsRow
|
||||
if err := rows.Scan(
|
||||
&i.Username,
|
||||
&i.AgentName,
|
||||
&i.WorkspaceName,
|
||||
&i.RxBytes,
|
||||
&i.TxBytes,
|
||||
&i.SessionCountVSCode,
|
||||
&i.SessionCountSSH,
|
||||
&i.SessionCountJetBrains,
|
||||
&i.SessionCountReconnectingPTY,
|
||||
&i.ConnectionCount,
|
||||
&i.ConnectionMedianLatencyMS,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const insertWorkspaceAgentStat = `-- name: InsertWorkspaceAgentStat :one
|
||||
INSERT INTO
|
||||
workspace_agent_stats (
|
||||
|
@ -103,3 +103,55 @@ WITH agent_stats AS (
|
||||
) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id
|
||||
)
|
||||
SELECT * FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id;
|
||||
|
||||
-- name: GetWorkspaceAgentStatsAndLabels :many
WITH agent_stats AS (
	-- Traffic totals per (user, agent, workspace) over all stats newer than the cutoff.
	SELECT
		user_id,
		agent_id,
		workspace_id,
		COALESCE(SUM(rx_bytes), 0)::bigint AS rx_bytes,
		COALESCE(SUM(tx_bytes), 0)::bigint AS tx_bytes
	FROM workspace_agent_stats
	WHERE workspace_agent_stats.created_at > $1
	GROUP BY user_id, agent_id, workspace_id
), latest_agent_stats AS (
	-- Session and connection figures from the newest qualifying stat of each agent.
	SELECT
		a.agent_id,
		COALESCE(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
		COALESCE(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
		COALESCE(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
		COALESCE(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
		COALESCE(SUM(connection_count), 0)::bigint AS connection_count,
		COALESCE(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
	FROM (
		SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
		FROM workspace_agent_stats
		-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
		WHERE created_at > $1 AND connection_median_latency_ms > 0
	) AS a
	WHERE a.rn = 1
	GROUP BY a.user_id, a.agent_id, a.workspace_id
)
SELECT
	users.username,
	workspace_agents.name AS agent_name,
	workspaces.name AS workspace_name,
	rx_bytes,
	tx_bytes,
	session_count_vscode,
	session_count_ssh,
	session_count_jetbrains,
	session_count_reconnecting_pty,
	connection_count,
	connection_median_latency_ms
FROM agent_stats
JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id
JOIN users ON users.id = agent_stats.user_id
JOIN workspace_agents ON workspace_agents.id = agent_stats.agent_id
JOIN workspaces ON workspaces.id = agent_stats.workspace_id;
|
||||
|
Reference in New Issue
Block a user