mirror of
https://github.com/coder/coder.git
synced 2025-07-03 16:13:58 +00:00
feat: expose workspace statuses (with details) as a prometheus metric (#12762)
Implements #12462
This commit is contained in:
@ -24,10 +24,12 @@ import (
|
||||
"github.com/coder/coder/v2/tailnet"
|
||||
)
|
||||
|
||||
const defaultRefreshRate = time.Minute
|
||||
|
||||
// ActiveUsers tracks the number of users that have authenticated within the past hour.
|
||||
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
||||
if duration == 0 {
|
||||
duration = 5 * time.Minute
|
||||
duration = defaultRefreshRate
|
||||
}
|
||||
|
||||
gauge := prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
@ -72,36 +74,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
|
||||
}
|
||||
|
||||
// Workspaces tracks the total number of workspaces with labels on status.
|
||||
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
||||
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
|
||||
if duration == 0 {
|
||||
duration = 5 * time.Minute
|
||||
duration = defaultRefreshRate
|
||||
}
|
||||
|
||||
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "coderd",
|
||||
Subsystem: "api",
|
||||
Name: "workspace_latest_build_total",
|
||||
Help: "The latest workspace builds with a status.",
|
||||
Help: "The current number of workspace builds by status.",
|
||||
}, []string{"status"})
|
||||
err := registerer.Register(gauge)
|
||||
if err != nil {
|
||||
if err := registerer.Register(workspaceLatestBuildTotals); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "coderd",
|
||||
Name: "workspace_latest_build_status",
|
||||
Help: "The current workspace statuses by template, transition, and owner.",
|
||||
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
|
||||
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// This exists so the prometheus metric exports immediately when set.
|
||||
// It helps with tests so they don't have to wait for a tick.
|
||||
gauge.WithLabelValues("pending").Set(0)
|
||||
|
||||
ctx, cancelFunc := context.WithCancel(ctx)
|
||||
done := make(chan struct{})
|
||||
|
||||
// Use time.Nanosecond to force an initial tick. It will be reset to the
|
||||
// correct duration after executing once.
|
||||
ticker := time.NewTicker(time.Nanosecond)
|
||||
doTick := func() {
|
||||
defer ticker.Reset(duration)
|
||||
|
||||
updateWorkspaceTotals := func() {
|
||||
builds, err := db.GetLatestWorkspaceBuilds(ctx)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// clear all series if there are no database entries
|
||||
workspaceLatestBuildTotals.Reset()
|
||||
}
|
||||
|
||||
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
|
||||
return
|
||||
}
|
||||
jobIDs := make([]uuid.UUID, 0, len(builds))
|
||||
@ -110,16 +118,53 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
|
||||
}
|
||||
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
|
||||
if err != nil {
|
||||
ids := make([]string, 0, len(jobIDs))
|
||||
for _, id := range jobIDs {
|
||||
ids = append(ids, id.String())
|
||||
}
|
||||
|
||||
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
gauge.Reset()
|
||||
workspaceLatestBuildTotals.Reset()
|
||||
for _, job := range jobs {
|
||||
status := codersdk.ProvisionerJobStatus(job.JobStatus)
|
||||
gauge.WithLabelValues(string(status)).Add(1)
|
||||
workspaceLatestBuildTotals.WithLabelValues(string(status)).Add(1)
|
||||
}
|
||||
}
|
||||
|
||||
updateWorkspaceStatuses := func() {
|
||||
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
|
||||
Deleted: false,
|
||||
WithSummary: false,
|
||||
})
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// clear all series if there are no database entries
|
||||
workspaceLatestBuildStatuses.Reset()
|
||||
}
|
||||
|
||||
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
|
||||
return
|
||||
}
|
||||
|
||||
workspaceLatestBuildStatuses.Reset()
|
||||
for _, w := range ws {
|
||||
workspaceLatestBuildStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1)
|
||||
}
|
||||
}
|
||||
|
||||
// Use time.Nanosecond to force an initial tick. It will be reset to the
|
||||
// correct duration after executing once.
|
||||
ticker := time.NewTicker(time.Nanosecond)
|
||||
doTick := func() {
|
||||
defer ticker.Reset(duration)
|
||||
|
||||
updateWorkspaceTotals()
|
||||
updateWorkspaceStatuses()
|
||||
}
|
||||
|
||||
go func() {
|
||||
defer close(done)
|
||||
defer ticker.Stop()
|
||||
@ -141,7 +186,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
|
||||
// Agents tracks metrics about workspace agents.
|
||||
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
|
||||
if duration == 0 {
|
||||
duration = 1 * time.Minute
|
||||
duration = defaultRefreshRate
|
||||
}
|
||||
|
||||
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
@ -330,7 +375,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
|
||||
|
||||
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
|
||||
if duration == 0 {
|
||||
duration = 1 * time.Minute
|
||||
duration = defaultRefreshRate
|
||||
}
|
||||
|
||||
if len(aggregateByLabels) == 0 {
|
||||
|
Reference in New Issue
Block a user