Files
coder/coderd/metricscache/metricscache.go
Danielle Maywood 6dd51f92fb chore: test metricscache on postgres (#16711)
metricscache_test has been running tests against dbmem only, instead of
against postgres. Unfortunately the implementations of
GetTemplateAverageBuildTime have diverged between dbmem and postgres.
This change gets the tests working on Postgres and test for the
behaviour postgres provides.
2025-02-27 09:43:51 +00:00

290 lines
7.4 KiB
Go

package metricscache
import (
"context"
"database/sql"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/quartz"
"github.com/coder/retry"
)
// Cache holds the template metrics.
// The aggregation queries responsible for these values can take up to a minute
// on large deployments. Even in small deployments, aggregation queries can
// take a few hundred milliseconds, which would ruin page load times and
// database performance if in the hot path.
type Cache struct {
database database.Store
log slog.Logger
clock quartz.Clock
intervals Intervals
templateWorkspaceOwners atomic.Pointer[map[uuid.UUID]int]
templateAverageBuildTime atomic.Pointer[map[uuid.UUID]database.GetTemplateAverageBuildTimeRow]
deploymentStatsResponse atomic.Pointer[codersdk.DeploymentStats]
done chan struct{}
cancel func()
// usage is a experiment flag to enable new workspace usage tracking behavior and will be
// removed when the experiment is complete.
usage bool
}
type Intervals struct {
TemplateBuildTimes time.Duration
DeploymentStats time.Duration
}
func New(db database.Store, log slog.Logger, clock quartz.Clock, intervals Intervals, usage bool) *Cache {
if intervals.TemplateBuildTimes <= 0 {
intervals.TemplateBuildTimes = time.Hour
}
if intervals.DeploymentStats <= 0 {
intervals.DeploymentStats = time.Minute
}
ctx, cancel := context.WithCancel(context.Background())
c := &Cache{
clock: clock,
database: db,
intervals: intervals,
log: log,
done: make(chan struct{}),
cancel: cancel,
usage: usage,
}
go func() {
var wg sync.WaitGroup
defer close(c.done)
wg.Add(1)
go func() {
defer wg.Done()
c.run(ctx, "template build times", intervals.TemplateBuildTimes, c.refreshTemplateBuildTimes)
}()
wg.Add(1)
go func() {
defer wg.Done()
c.run(ctx, "deployment stats", intervals.DeploymentStats, c.refreshDeploymentStats)
}()
wg.Wait()
}()
return c
}
func (c *Cache) refreshTemplateBuildTimes(ctx context.Context) error {
//nolint:gocritic // This is a system service.
ctx = dbauthz.AsSystemRestricted(ctx)
templates, err := c.database.GetTemplates(ctx)
if err != nil {
return err
}
var (
templateWorkspaceOwners = make(map[uuid.UUID]int)
templateAverageBuildTimes = make(map[uuid.UUID]database.GetTemplateAverageBuildTimeRow)
)
ids := make([]uuid.UUID, 0, len(templates))
for _, template := range templates {
ids = append(ids, template.ID)
templateAvgBuildTime, err := c.database.GetTemplateAverageBuildTime(ctx, database.GetTemplateAverageBuildTimeParams{
TemplateID: uuid.NullUUID{
UUID: template.ID,
Valid: true,
},
StartTime: sql.NullTime{
Time: dbtime.Time(c.clock.Now().AddDate(0, 0, -30)),
Valid: true,
},
})
if err != nil {
return err
}
templateAverageBuildTimes[template.ID] = templateAvgBuildTime
}
owners, err := c.database.GetWorkspaceUniqueOwnerCountByTemplateIDs(ctx, ids)
if err != nil {
return xerrors.Errorf("get workspace unique owner count by template ids: %w", err)
}
for _, owner := range owners {
templateWorkspaceOwners[owner.TemplateID] = int(owner.UniqueOwnersSum)
}
c.templateWorkspaceOwners.Store(&templateWorkspaceOwners)
c.templateAverageBuildTime.Store(&templateAverageBuildTimes)
return nil
}
func (c *Cache) refreshDeploymentStats(ctx context.Context) error {
var (
from = c.clock.Now().Add(-15 * time.Minute)
agentStats database.GetDeploymentWorkspaceAgentStatsRow
err error
)
if c.usage {
agentUsageStats, err := c.database.GetDeploymentWorkspaceAgentUsageStats(ctx, from)
if err != nil {
return err
}
agentStats = database.GetDeploymentWorkspaceAgentStatsRow(agentUsageStats)
} else {
agentStats, err = c.database.GetDeploymentWorkspaceAgentStats(ctx, from)
if err != nil {
return err
}
}
workspaceStats, err := c.database.GetDeploymentWorkspaceStats(ctx)
if err != nil {
return err
}
c.deploymentStatsResponse.Store(&codersdk.DeploymentStats{
AggregatedFrom: from,
CollectedAt: dbtime.Time(c.clock.Now()),
NextUpdateAt: dbtime.Time(c.clock.Now().Add(c.intervals.DeploymentStats)),
Workspaces: codersdk.WorkspaceDeploymentStats{
Pending: workspaceStats.PendingWorkspaces,
Building: workspaceStats.BuildingWorkspaces,
Running: workspaceStats.RunningWorkspaces,
Failed: workspaceStats.FailedWorkspaces,
Stopped: workspaceStats.StoppedWorkspaces,
ConnectionLatencyMS: codersdk.WorkspaceConnectionLatencyMS{
P50: agentStats.WorkspaceConnectionLatency50,
P95: agentStats.WorkspaceConnectionLatency95,
},
RxBytes: agentStats.WorkspaceRxBytes,
TxBytes: agentStats.WorkspaceTxBytes,
},
SessionCount: codersdk.SessionCountDeploymentStats{
VSCode: agentStats.SessionCountVSCode,
SSH: agentStats.SessionCountSSH,
JetBrains: agentStats.SessionCountJetBrains,
ReconnectingPTY: agentStats.SessionCountReconnectingPTY,
},
})
return nil
}
func (c *Cache) run(ctx context.Context, name string, interval time.Duration, refresh func(context.Context) error) {
logger := c.log.With(slog.F("name", name), slog.F("interval", interval))
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
for r := retry.New(time.Millisecond*100, time.Minute); r.Wait(ctx); {
start := time.Now()
err := refresh(ctx)
if err != nil {
if ctx.Err() != nil {
return
}
if xerrors.Is(err, sql.ErrNoRows) {
break
}
logger.Error(ctx, "refresh metrics failed", slog.Error(err))
continue
}
logger.Debug(ctx, "metrics refreshed", slog.F("took", time.Since(start)))
break
}
select {
case <-ticker.C:
case <-c.done:
return
case <-ctx.Done():
return
}
}
}
func (c *Cache) Close() error {
c.cancel()
<-c.done
return nil
}
func (c *Cache) TemplateBuildTimeStats(id uuid.UUID) codersdk.TemplateBuildTimeStats {
unknown := codersdk.TemplateBuildTimeStats{
codersdk.WorkspaceTransitionStart: {},
codersdk.WorkspaceTransitionStop: {},
codersdk.WorkspaceTransitionDelete: {},
}
m := c.templateAverageBuildTime.Load()
if m == nil {
// Data loading.
return unknown
}
resp, ok := (*m)[id]
if !ok {
// No data or not enough builds.
return unknown
}
convertMillis := func(m float64) *int64 {
if m <= 0 {
return nil
}
i := int64(m * 1000)
return &i
}
return codersdk.TemplateBuildTimeStats{
codersdk.WorkspaceTransitionStart: {
P50: convertMillis(resp.Start50),
P95: convertMillis(resp.Start95),
},
codersdk.WorkspaceTransitionStop: {
P50: convertMillis(resp.Stop50),
P95: convertMillis(resp.Stop95),
},
codersdk.WorkspaceTransitionDelete: {
P50: convertMillis(resp.Delete50),
P95: convertMillis(resp.Delete95),
},
}
}
func (c *Cache) TemplateWorkspaceOwners(id uuid.UUID) (int, bool) {
m := c.templateWorkspaceOwners.Load()
if m == nil {
// Data loading.
return -1, false
}
resp, ok := (*m)[id]
if !ok {
// Probably no data.
return -1, false
}
return resp, true
}
func (c *Cache) DeploymentStats() (codersdk.DeploymentStats, bool) {
deploymentStats := c.deploymentStatsResponse.Load()
if deploymentStats == nil {
return codersdk.DeploymentStats{}, false
}
return *deploymentStats, true
}