feat: expose workspace statuses (with details) as a prometheus metric (#12762)

Implements #12462
This commit is contained in:
Danny Kopping
2024-04-02 09:57:36 +02:00
committed by GitHub
parent 114830de26
commit 79fb8e43c5
8 changed files with 375 additions and 161 deletions

View File

@ -24,10 +24,12 @@ import (
"github.com/coder/coder/v2/tailnet"
)
const defaultRefreshRate = time.Minute
// ActiveUsers tracks the number of users that have authenticated within the past hour.
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 5 * time.Minute
duration = defaultRefreshRate
}
gauge := prometheus.NewGauge(prometheus.GaugeOpts{
@ -72,36 +74,42 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
}
// Workspaces tracks the total number of workspaces with labels on status.
func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 5 * time.Minute
duration = defaultRefreshRate
}
gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
workspaceLatestBuildTotals := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "api",
Name: "workspace_latest_build_total",
Help: "The latest workspace builds with a status.",
Help: "The current number of workspace builds by status.",
}, []string{"status"})
err := registerer.Register(gauge)
if err != nil {
if err := registerer.Register(workspaceLatestBuildTotals); err != nil {
return nil, err
}
workspaceLatestBuildStatuses := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Name: "workspace_latest_build_status",
Help: "The current workspace statuses by template, transition, and owner.",
}, []string{"status", "template_name", "template_version", "workspace_owner", "workspace_transition"})
if err := registerer.Register(workspaceLatestBuildStatuses); err != nil {
return nil, err
}
// This exists so the prometheus metric exports immediately when set.
// It helps with tests so they don't have to wait for a tick.
gauge.WithLabelValues("pending").Set(0)
ctx, cancelFunc := context.WithCancel(ctx)
done := make(chan struct{})
// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)
updateWorkspaceTotals := func() {
builds, err := db.GetLatestWorkspaceBuilds(ctx)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceLatestBuildTotals.Reset()
}
logger.Warn(ctx, "failed to load latest workspace builds", slog.Error(err))
return
}
jobIDs := make([]uuid.UUID, 0, len(builds))
@ -110,16 +118,53 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
}
jobs, err := db.GetProvisionerJobsByIDs(ctx, jobIDs)
if err != nil {
ids := make([]string, 0, len(jobIDs))
for _, id := range jobIDs {
ids = append(ids, id.String())
}
logger.Warn(ctx, "failed to load provisioner jobs", slog.F("ids", ids), slog.Error(err))
return
}
gauge.Reset()
workspaceLatestBuildTotals.Reset()
for _, job := range jobs {
status := codersdk.ProvisionerJobStatus(job.JobStatus)
gauge.WithLabelValues(string(status)).Add(1)
workspaceLatestBuildTotals.WithLabelValues(string(status)).Add(1)
}
}
updateWorkspaceStatuses := func() {
ws, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{
Deleted: false,
WithSummary: false,
})
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
// clear all series if there are no database entries
workspaceLatestBuildStatuses.Reset()
}
logger.Warn(ctx, "failed to load active workspaces", slog.Error(err))
return
}
workspaceLatestBuildStatuses.Reset()
for _, w := range ws {
workspaceLatestBuildStatuses.WithLabelValues(string(w.LatestBuildStatus), w.TemplateName, w.TemplateVersionName.String, w.Username, string(w.LatestBuildTransition)).Add(1)
}
}
// Use time.Nanosecond to force an initial tick. It will be reset to the
// correct duration after executing once.
ticker := time.NewTicker(time.Nanosecond)
doTick := func() {
defer ticker.Reset(duration)
updateWorkspaceTotals()
updateWorkspaceStatuses()
}
go func() {
defer close(done)
defer ticker.Stop()
@ -141,7 +186,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
// Agents tracks the total number of workspaces with labels on status.
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMapFn func() *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) {
if duration == 0 {
duration = 1 * time.Minute
duration = defaultRefreshRate
}
agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
@ -330,7 +375,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis
func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration, aggregateByLabels []string) (func(), error) {
if duration == 0 {
duration = 1 * time.Minute
duration = defaultRefreshRate
}
if len(aggregateByLabels) == 0 {