feat: add prometheus metric for tracking user statuses (#15281)

This commit is contained in:
Colin Adler
2024-10-30 13:41:16 -05:00
committed by GitHub
parent e9fbfcc45b
commit 3de98c25db
3 changed files with 158 additions and 3 deletions

View File

@ -212,10 +212,16 @@ func enablePrometheus(
options.PrometheusRegistry.MustRegister(collectors.NewGoCollector())
options.PrometheusRegistry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
closeUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.PrometheusRegistry, options.Database, 0)
closeActiveUsersFunc, err := prometheusmetrics.ActiveUsers(ctx, options.Logger.Named("active_user_metrics"), options.PrometheusRegistry, options.Database, 0)
if err != nil {
return nil, xerrors.Errorf("register active users prometheus metric: %w", err)
}
afterCtx(ctx, closeActiveUsersFunc)
closeUsersFunc, err := prometheusmetrics.Users(ctx, options.Logger.Named("user_metrics"), quartz.NewReal(), options.PrometheusRegistry, options.Database, 0)
if err != nil {
return nil, xerrors.Errorf("register users prometheus metric: %w", err)
}
afterCtx(ctx, closeUsersFunc)
closeWorkspacesFunc, err := prometheusmetrics.Workspaces(ctx, options.Logger.Named("workspaces_metrics"), options.PrometheusRegistry, options.Database, 0)

View File

@ -12,6 +12,7 @@ import (
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
"cdr.dev/slog"
@ -22,12 +23,13 @@ import (
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/tailnet"
"github.com/coder/quartz"
)
const defaultRefreshRate = time.Minute
// ActiveUsers tracks the number of users that have authenticated within the past hour.
func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
func ActiveUsers(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {
duration = defaultRefreshRate
}
@ -58,6 +60,7 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
apiKeys, err := db.GetAPIKeysLastUsedAfter(ctx, dbtime.Now().Add(-1*time.Hour))
if err != nil {
logger.Error(ctx, "get api keys for active users prometheus metric", slog.Error(err))
continue
}
distinctUsers := map[uuid.UUID]struct{}{}
@ -73,6 +76,57 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab
}, nil
}
// Users tracks the total number of registered users, partitioned by status.
//
// It registers the coderd_api_total_user_count gauge vector (label "status")
// on registerer and starts a goroutine that refreshes it on every tick of a
// ticker created from clk with the given duration. A zero duration selects
// five times defaultRefreshRate.
//
// The returned function cancels the refresh loop and blocks until its
// goroutine has exited. An error is returned only when registering the gauge
// fails.
func Users(ctx context.Context, logger slog.Logger, clk quartz.Clock, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
	if duration == 0 {
		// It's not super important this tracks real-time.
		duration = defaultRefreshRate * 5
	}

	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "api",
		Name:      "total_user_count",
		Help:      "The total number of registered users, partitioned by status.",
	}, []string{"status"})
	err := registerer.Register(gauge)
	if err != nil {
		return nil, xerrors.Errorf("register total_user_count gauge: %w", err)
	}

	ctx, cancelFunc := context.WithCancel(ctx)
	done := make(chan struct{})
	ticker := clk.NewTicker(duration)
	go func() {
		defer close(done)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			// Query before touching the gauge: if the query fails we keep
			// exporting the last known values instead of an empty metric.
			//nolint:gocritic // This is a system service that needs full access
			//to the users table.
			users, err := db.GetUsers(dbauthz.AsSystemRestricted(ctx), database.GetUsersParams{})
			if err != nil {
				logger.Error(ctx, "get all users for prometheus metrics", slog.Error(err))
				continue
			}

			// Aggregate first so the gauge is only briefly empty between
			// Reset and the final Set calls, rather than for the duration of
			// the database query and a per-user increment loop.
			counts := make(map[string]float64)
			for _, user := range users {
				counts[string(user.Status)]++
			}

			// Reset drops series for statuses that no longer have any users.
			gauge.Reset()
			for status, count := range counts {
				gauge.WithLabelValues(status).Set(count)
			}
		}
	}()
	return func() {
		cancelFunc()
		<-done
	}, nil
}
// Workspaces tracks the total number of workspaces with labels on status.
func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) {
if duration == 0 {

View File

@ -38,6 +38,7 @@ import (
"github.com/coder/coder/v2/tailnet"
"github.com/coder/coder/v2/tailnet/tailnettest"
"github.com/coder/coder/v2/testutil"
"github.com/coder/quartz"
)
func TestActiveUsers(t *testing.T) {
@ -98,7 +99,7 @@ func TestActiveUsers(t *testing.T) {
t.Run(tc.Name, func(t *testing.T) {
t.Parallel()
registry := prometheus.NewRegistry()
closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), registry, tc.Database(t), time.Millisecond)
closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), slogtest.Make(t, nil), registry, tc.Database(t), time.Millisecond)
require.NoError(t, err)
t.Cleanup(closeFunc)
@ -112,6 +113,100 @@ func TestActiveUsers(t *testing.T) {
}
}
// TestUsers verifies that prometheusmetrics.Users exports one gauge series per
// user status with the expected count, and that the series are refreshed when
// the mock clock advances after the set of users changes.
func TestUsers(t *testing.T) {
	t.Parallel()

	for _, tc := range []struct {
		Name     string
		Database func(t *testing.T) database.Store
		Count    map[database.UserStatus]int
	}{{
		Name: "None",
		Database: func(t *testing.T) database.Store {
			return dbmem.New()
		},
		Count: map[database.UserStatus]int{},
	}, {
		Name: "One",
		Database: func(t *testing.T) database.Store {
			db := dbmem.New()
			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
			return db
		},
		Count: map[database.UserStatus]int{database.UserStatusActive: 1},
	}, {
		Name: "MultipleStatuses",
		Database: func(t *testing.T) database.Store {
			db := dbmem.New()
			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
			dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
			return db
		},
		Count: map[database.UserStatus]int{database.UserStatusActive: 1, database.UserStatusDormant: 1},
	}, {
		Name: "MultipleActive",
		Database: func(t *testing.T) database.Store {
			db := dbmem.New()
			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
			dbgen.User(t, db, database.User{Status: database.UserStatusActive})
			return db
		},
		Count: map[database.UserStatus]int{database.UserStatusActive: 3},
	}} {
		tc := tc
		t.Run(tc.Name, func(t *testing.T) {
			t.Parallel()

			ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
			defer cancel()

			registry := prometheus.NewRegistry()
			mClock := quartz.NewMock(t)
			db := tc.Database(t)
			closeFunc, err := prometheusmetrics.Users(context.Background(), slogtest.Make(t, nil), mClock, registry, db, time.Millisecond)
			require.NoError(t, err)
			t.Cleanup(closeFunc)

			// Fire the first tick so the refresh loop runs once.
			_, w := mClock.AdvanceNext()
			w.MustWait(ctx)

			checkFn := func() bool {
				metrics, err := registry.Gather()
				if err != nil {
					return false
				}

				// If we get no metrics and we know none should exist, bail
				// early. If we get no metrics but we expect some, retry.
				if len(metrics) == 0 {
					return len(tc.Count) == 0
				}

				// Every expected status must have its own series. Without
				// this, stale data that is merely missing a status would
				// satisfy the per-series comparison below.
				if len(metrics[0].Metric) != len(tc.Count) {
					return false
				}

				for _, metric := range metrics[0].Metric {
					if tc.Count[database.UserStatus(*metric.Label[0].Value)] != int(metric.Gauge.GetValue()) {
						return false
					}
				}

				return true
			}

			require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)

			// Add another dormant user and ensure it updates
			dbgen.User(t, db, database.User{Status: database.UserStatusDormant})
			tc.Count[database.UserStatusDormant]++

			_, w = mClock.AdvanceNext()
			w.MustWait(ctx)

			require.Eventually(t, checkFn, testutil.WaitShort, testutil.IntervalFast)
		})
	}
}
func TestWorkspaceLatestBuildTotals(t *testing.T) {
t.Parallel()