Merge branch 'main' of github.com:/coder/coder into dk/prebuilds

Signed-off-by: Danny Kopping <danny@coder.com>
Danny Kopping
2025-02-19 15:40:19 +00:00
275 changed files with 10767 additions and 1322 deletions

View File

@@ -17,10 +17,12 @@ import (
"cdr.dev/slog"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/appearance"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/coderd/workspacestats"
@@ -29,6 +31,7 @@ import (
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/tailnet"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
)
// API implements the DRPC agent API interface from agent/proto. This struct is
@@ -59,7 +62,9 @@ type Options struct {
Ctx context.Context
Log slog.Logger
Clock quartz.Clock
Database database.Store
NotificationsEnqueuer notifications.Enqueuer
Pubsub pubsub.Pubsub
DerpMapFn func() *tailcfg.DERPMap
TailnetCoordinator *atomic.Pointer[tailnet.Coordinator]
@@ -82,6 +87,10 @@ type Options struct {
}
func New(opts Options) *API {
if opts.Clock == nil {
opts.Clock = quartz.NewReal()
}
api := &API{
opts: opts,
mu: sync.Mutex{},
@@ -106,9 +115,22 @@ func New(opts Options) *API {
}
api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{
Log: opts.Log,
AgentID: opts.AgentID,
WorkspaceID: opts.WorkspaceID,
Clock: opts.Clock,
Database: opts.Database,
NotificationsEnqueuer: opts.NotificationsEnqueuer,
Debounce: 5 * time.Minute,
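// With these defaults the agent reports a rolling window of 20
// datapoints collected 10 seconds apart; the monitor alerts once
// 50% of the window is NOK consecutively or 20% of it is NOK overall.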
Config: resourcesmonitor.Config{
NumDatapoints: 20,
CollectionInterval: 10 * time.Second,
Alert: resourcesmonitor.AlertConfig{
MinimumNOKsPercent: 20,
ConsecutiveNOKsPercent: 50,
},
},
}
api.StatsAPI = &StatsAPI{

View File

@@ -4,20 +4,35 @@ import (
"context"
"database/sql"
"errors"
"fmt"
"time"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/google/uuid"
"cdr.dev/slog"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/quartz"
)
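// ResourcesMonitoringAPI serves the agent's monitor configuration and
// evaluates pushed usage datapoints against the workspace's memory and
// volume monitors, enqueuing notifications when a monitor alerts.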
type ResourcesMonitoringAPI struct {
AgentID uuid.UUID
WorkspaceID uuid.UUID
Log slog.Logger
Clock quartz.Clock
Database database.Store
NotificationsEnqueuer notifications.Enqueuer
Debounce time.Duration
Config resourcesmonitor.Config
}
func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
return &proto.GetResourcesMonitoringConfigurationResponse{
Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
NumDatapoints: a.Config.NumDatapoints,
},
Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
if memoryErr != nil {
@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
}
func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
var err error
if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
}
if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil {
err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr))
}
return &proto.PushResourcesMonitoringUsageResponse{}, err
}
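// monitorMemory evaluates the pushed datapoints against the agent's memory
// monitor (if one exists and is enabled), persists the resulting state and
// debounce deadline, and enqueues an out-of-memory notification when the
// transition warrants one.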
func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
if err != nil {
// It is valid for an agent to not have a memory monitor, so we
// do not want to treat it as an error.
if errors.Is(err, sql.ErrNoRows) {
return nil
}
return xerrors.Errorf("fetch memory resource monitor: %w", err)
}
if !monitor.Enabled {
return nil
}
usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, 0, len(datapoints))
for _, datapoint := range datapoints {
usageDatapoints = append(usageDatapoints, datapoint.Memory)
}
usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)
oldState := monitor.State
newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
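// The debounce window ensures that only an OK -> NOK transition occurring
// after the previous deadline fires a notification; every other transition
// merely updates the stored state.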
debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
//nolint:gocritic // We need to be able to update the resource monitor here.
err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
AgentID: a.AgentID,
State: newState,
UpdatedAt: dbtime.Time(a.Clock.Now()),
DebouncedUntil: dbtime.Time(debouncedUntil),
})
if err != nil {
return xerrors.Errorf("update workspace monitor: %w", err)
}
if !shouldNotify {
return nil
}
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
_, err = a.NotificationsEnqueuer.EnqueueWithData(
//nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfMemory,
map[string]string{
"workspace": workspace.Name,
"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
},
map[string]any{
// NOTE(DanielleMaywood):
// When notifications are enqueued, they are checked to be
// unique within a single day. This means that if we attempt
// to send two OOM notifications for the same workspace on
// the same day, the enqueuer will prevent us from sending
// a second one. We inject a timestamp to make the
// notifications appear different enough to circumvent this
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-memory",
)
if err != nil {
return xerrors.Errorf("notify workspace OOM: %w", err)
}
return nil
}
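// monitorVolumes does the same for each enabled volume monitor, collecting
// every newly alerting volume into a single out-of-disk notification.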
func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
if err != nil {
return xerrors.Errorf("get or insert volume monitor: %w", err)
}
outOfDiskVolumes := make([]map[string]any, 0)
for _, monitor := range volumeMonitors {
if !monitor.Enabled {
continue
}
usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints))
for _, datapoint := range datapoints {
var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage
for _, volume := range datapoint.Volumes {
if volume.Volume == monitor.Path {
usage = volume
break
}
}
usageDatapoints = append(usageDatapoints, usage)
}
usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints)
oldState := monitor.State
newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)
debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)
if shouldNotify {
outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{
"path": monitor.Path,
"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
})
}
//nolint:gocritic // We need to be able to update the resource monitor here.
if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
AgentID: a.AgentID,
Path: monitor.Path,
State: newState,
UpdatedAt: dbtime.Time(a.Clock.Now()),
DebouncedUntil: dbtime.Time(debouncedUntil),
}); err != nil {
return xerrors.Errorf("update workspace monitor: %w", err)
}
}
if len(outOfDiskVolumes) == 0 {
return nil
}
workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
if err != nil {
return xerrors.Errorf("get workspace by id: %w", err)
}
if _, err := a.NotificationsEnqueuer.EnqueueWithData(
//nolint:gocritic // We need to be able to send the notification.
dbauthz.AsNotifier(ctx),
workspace.OwnerID,
notifications.TemplateWorkspaceOutOfDisk,
map[string]string{
"workspace": workspace.Name,
},
map[string]any{
"volumes": outOfDiskVolumes,
// NOTE(DanielleMaywood):
// When notifications are enqueued, they are checked to be
// unique within a single day. This means that if we attempt
// to send two out-of-disk notifications for the same workspace on
// the same day, the enqueuer will prevent us from sending
// a second one. We inject a timestamp to make the
// notifications appear different enough to circumvent this
// deduplication logic.
"timestamp": a.Clock.Now(),
},
"workspace-monitor-volumes",
); err != nil {
return xerrors.Errorf("notify workspace OOD: %w", err)
}
return nil
}

View File

@@ -0,0 +1,944 @@
package agentapi_test
import (
"context"
"testing"
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
"github.com/coder/quartz"
)
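// resourceMonitorAPI constructs a ResourcesMonitoringAPI backed by a test
// database, a mock clock, and a fake notification enqueuer so tests can
// control time and assert on enqueued notifications.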
func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
t.Helper()
db, _ := dbtestutil.NewDB(t)
user := dbgen.User(t, db, database.User{})
org := dbgen.Organization(t, db, database.Organization{})
template := dbgen.Template(t, db, database.Template{
OrganizationID: org.ID,
CreatedBy: user.ID,
})
templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{Valid: true, UUID: template.ID},
OrganizationID: org.ID,
CreatedBy: user.ID,
})
workspace := dbgen.Workspace(t, db, database.WorkspaceTable{
OrganizationID: org.ID,
TemplateID: template.ID,
OwnerID: user.ID,
})
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
Type: database.ProvisionerJobTypeWorkspaceBuild,
})
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
JobID: job.ID,
WorkspaceID: workspace.ID,
TemplateVersionID: templateVersion.ID,
})
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
JobID: build.JobID,
})
agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
ResourceID: resource.ID,
})
notifyEnq := &notificationstest.FakeEnqueuer{}
clock := quartz.NewMock(t)
return &agentapi.ResourcesMonitoringAPI{
AgentID: agent.ID,
WorkspaceID: workspace.ID,
Clock: clock,
Database: db,
NotificationsEnqueuer: notifyEnq,
Config: resourcesmonitor.Config{
NumDatapoints: 20,
CollectionInterval: 10 * time.Second,
Alert: resourcesmonitor.AlertConfig{
MinimumNOKsPercent: 20,
ConsecutiveNOKsPercent: 50,
},
},
Debounce: 1 * time.Minute,
}, user, clock, notifyEnq
}
func TestMemoryResourceMonitorDebounce(t *testing.T) {
t.Parallel()
// This test is a bit of a long one. We're testing that
// when a monitor goes into an alert state, it doesn't
// allow another notification to occur until after the
// debounce period.
//
// 1. OK -> NOK |> sends a notification
// 2. NOK -> OK |> does nothing
// 3. OK -> NOK |> does nothing due to debounce period
// 4. NOK -> OK |> does nothing
// 5. OK -> NOK |> sends a notification as debounce period exceeded
api, user, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 100
// Given: A monitor in an OK state
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: The monitor is given a state that will trigger NOK
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect there to be a notification sent
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 1)
require.Equal(t, user.ID, sent[0].UserID)
notifyEnq.Clear()
// When: The monitor moves to an OK state from NOK
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect no new notifications
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
notifyEnq.Clear()
// When: The monitor moves back to a NOK state before the debounced time.
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect no new notifications (showing the debouncer working)
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
notifyEnq.Clear()
// When: The monitor moves back to an OK state from NOK
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We still expect no new notifications
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
notifyEnq.Clear()
// When: The monitor moves back to a NOK state after the debounce period.
clock.Advance(api.Debounce/4 + 1*time.Second)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect a notification
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 1)
require.Equal(t, user.ID, sent[0].UserID)
}
func TestMemoryResourceMonitor(t *testing.T) {
t.Parallel()
tests := []struct {
name string
memoryUsage []int64
memoryTotal int64
previousState database.WorkspaceAgentMonitorState
expectState database.WorkspaceAgentMonitorState
shouldNotify bool
}{
{
name: "WhenOK/NeverExceedsThreshold",
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenOK/ShouldStayInOK",
memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenOK/ConsecutiveExceedsThreshold",
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: true,
},
{
name: "WhenOK/MinimumExceedsThreshold",
memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: true,
},
{
name: "WhenNOK/NeverExceedsThreshold",
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenNOK/ShouldStayInNOK",
memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
{
name: "WhenNOK/ConsecutiveExceedsThreshold",
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
{
name: "WhenNOK/MinimumExceedsThreshold",
memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
memoryTotal: 10,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
}
for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
api, user, clock, notifyEnq := resourceMonitorAPI(t)
datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
collectedAt := clock.Now()
for _, usage := range tt.memoryUsage {
collectedAt = collectedAt.Add(15 * time.Second)
datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
CollectedAt: timestamppb.New(collectedAt),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: usage,
Total: tt.memoryTotal,
},
})
}
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: tt.previousState,
Threshold: 80,
})
clock.Set(collectedAt)
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: datapoints,
})
require.NoError(t, err)
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
if tt.shouldNotify {
require.Len(t, sent, 1)
require.Equal(t, user.ID, sent[0].UserID)
} else {
require.Len(t, sent, 0)
}
})
}
}
func TestMemoryResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as the unknown datapoint prevents us from knowing whether we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
})
}
func TestVolumeResourceMonitorDebounce(t *testing.T) {
t.Parallel()
// This test is an even longer one. We're testing
// that the debounce logic is independent per
// volume monitor. We interleave the triggering
// of each monitor to ensure the debounce logic
// is monitor independent.
//
// First Monitor:
// 1. OK -> NOK |> sends a notification
// 2. NOK -> OK |> does nothing
// 3. OK -> NOK |> does nothing due to debounce period
// 4. NOK -> OK |> does nothing
// 5. OK -> NOK |> sends a notification as debounce period exceeded
// 6. NOK -> OK |> does nothing
//
// Second Monitor:
// 1. OK -> OK |> does nothing
// 2. OK -> NOK |> sends a notification
// 3. NOK -> OK |> does nothing
// 4. OK -> NOK |> does nothing due to debounce period
// 5. NOK -> OK |> does nothing
// 6. OK -> NOK |> sends a notification as debounce period exceeded
//
firstVolumePath := "/home/coder"
secondVolumePath := "/dev/coder"
api, _, clock, notifyEnq := resourceMonitorAPI(t)
// Given:
// - First monitor in an OK state
// - Second monitor in an OK state
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: firstVolumePath,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: secondVolumePath,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When:
// - First monitor is in a NOK state
// - Second monitor is in an OK state
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 10, Total: 10},
{Volume: secondVolumePath, Used: 1, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect a notification from only the first monitor
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 1)
volumes := requireVolumeData(t, sent[0])
require.Len(t, volumes, 1)
require.Equal(t, firstVolumePath, volumes[0]["path"])
notifyEnq.Clear()
// When:
// - First monitor moves back to OK
// - Second monitor moves to NOK
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 1, Total: 10},
{Volume: secondVolumePath, Used: 10, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect a notification from only the second monitor
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 1)
volumes = requireVolumeData(t, sent[0])
require.Len(t, volumes, 1)
require.Equal(t, secondVolumePath, volumes[0]["path"])
notifyEnq.Clear()
// When:
// - First monitor moves back to NOK before debounce period has ended
// - Second monitor moves back to OK
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 10, Total: 10},
{Volume: secondVolumePath, Used: 1, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect no new notifications
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 0)
notifyEnq.Clear()
// When:
// - First monitor moves back to OK
// - Second monitor moves back to NOK
clock.Advance(api.Debounce / 4)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 1, Total: 10},
{Volume: secondVolumePath, Used: 10, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect no new notifications.
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 0)
notifyEnq.Clear()
// When:
// - First monitor moves back to a NOK state after the debounce period
// - Second monitor moves back to OK
clock.Advance(api.Debounce/4 + 1*time.Second)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 10, Total: 10},
{Volume: secondVolumePath, Used: 1, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect a notification from only the first monitor
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 1)
volumes = requireVolumeData(t, sent[0])
require.Len(t, volumes, 1)
require.Equal(t, firstVolumePath, volumes[0]["path"])
notifyEnq.Clear()
// When:
// - First monitor moves back to OK
// - Second monitor moves back to NOK after the debounce period
clock.Advance(api.Debounce/4 + 1*time.Second)
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{Volume: firstVolumePath, Used: 1, Total: 10},
{Volume: secondVolumePath, Used: 10, Total: 10},
},
},
},
})
require.NoError(t, err)
// Then:
// - We expect a notification from only the second monitor
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 1)
volumes = requireVolumeData(t, sent[0])
require.Len(t, volumes, 1)
require.Equal(t, secondVolumePath, volumes[0]["path"])
}
func TestVolumeResourceMonitor(t *testing.T) {
t.Parallel()
tests := []struct {
name string
volumePath string
volumeUsage []int64
volumeTotal int64
thresholdPercent int32
previousState database.WorkspaceAgentMonitorState
expectState database.WorkspaceAgentMonitorState
shouldNotify bool
}{
{
name: "WhenOK/NeverExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenOK/ShouldStayInOK",
volumePath: "/home/coder",
volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenOK/ConsecutiveExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: true,
},
{
name: "WhenOK/MinimumExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: true,
},
{
name: "WhenNOK/NeverExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateOK,
shouldNotify: false,
},
{
name: "WhenNOK/ShouldStayInNOK",
volumePath: "/home/coder",
volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
{
name: "WhenNOK/ConsecutiveExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
{
name: "WhenNOK/MinimumExceedsThreshold",
volumePath: "/home/coder",
volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
volumeTotal: 10,
thresholdPercent: 80,
previousState: database.WorkspaceAgentMonitorStateNOK,
expectState: database.WorkspaceAgentMonitorStateNOK,
shouldNotify: false,
},
}
for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
api, user, clock, notifyEnq := resourceMonitorAPI(t)
datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
collectedAt := clock.Now()
for _, volumeUsage := range tt.volumeUsage {
collectedAt = collectedAt.Add(15 * time.Second)
volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: tt.volumePath,
Used: volumeUsage,
Total: tt.volumeTotal,
},
}
datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
CollectedAt: timestamppb.New(collectedAt),
Volumes: volumeDatapoints,
})
}
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: tt.volumePath,
State: tt.previousState,
Threshold: tt.thresholdPercent,
})
clock.Set(collectedAt)
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: datapoints,
})
require.NoError(t, err)
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
if tt.shouldNotify {
require.Len(t, sent, 1)
require.Equal(t, user.ID, sent[0].UserID)
} else {
require.Len(t, sent, 0)
}
})
}
}
func TestVolumeResourceMonitorMultiple(t *testing.T) {
t.Parallel()
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 100
// Given: two different volume resource monitors
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: "/home/coder",
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: "/dev/coder",
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: both of them move to a NOK state
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: "/home/coder",
Used: 10,
Total: 10,
},
{
Volume: "/dev/coder",
Used: 10,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect a single notification containing information about both volumes
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 1)
volumes := requireVolumeData(t, sent[0])
require.Len(t, volumes, 2)
require.Equal(t, "/home/coder", volumes[0]["path"])
require.Equal(t, "/dev/coder", volumes[1]["path"])
}
func TestVolumeResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as the unknown datapoint prevents us from knowing whether we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
})
}
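// requireVolumeData extracts the "volumes" payload from an enqueued
// out-of-disk notification, asserting it has the expected type.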
func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
t.Helper()
volumesData := notif.Data["volumes"]
require.IsType(t, []map[string]any{}, volumesData)
return volumesData.([]map[string]any)
}

View File

@@ -0,0 +1,129 @@
package resourcesmonitor
import (
"math"
"time"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/slice"
)
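// State classifies a single usage datapoint relative to a monitor's
// threshold. StateUnknown marks datapoints whose collection failed.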
type State int
const (
StateOK State = iota
StateNOK
StateUnknown
)
type AlertConfig struct {
// The percentage of consecutive NOK datapoints required
// to put the monitor in an alert state.
ConsecutiveNOKsPercent int
// The percentage of NOK datapoints anywhere in the window
// required to put the monitor in an alert state.
MinimumNOKsPercent int
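// Example: with a 20-datapoint window, ConsecutiveNOKsPercent: 50 trips
// after 10 consecutive NOK datapoints, while MinimumNOKsPercent: 20 trips
// once any 4 datapoints in the window are NOK.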
}
type Config struct {
// How many datapoints the agent should send.
NumDatapoints int32
// How long to wait between collecting datapoints.
CollectionInterval time.Duration
Alert AlertConfig
}
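// CalculateMemoryUsageStates maps each memory datapoint to StateOK or
// StateNOK by comparing its used/total percentage against the monitor's
// threshold; a nil datapoint (failed collection) becomes StateUnknown.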
func CalculateMemoryUsageStates(
monitor database.WorkspaceAgentMemoryResourceMonitor,
datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage,
) []State {
states := make([]State, 0, len(datapoints))
for _, datapoint := range datapoints {
state := StateUnknown
if datapoint != nil {
percent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)
if percent < monitor.Threshold {
state = StateOK
} else {
state = StateNOK
}
}
states = append(states, state)
}
return states
}
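// CalculateVolumeUsageStates performs the same per-datapoint
// classification for a single volume monitor.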
func CalculateVolumeUsageStates(
monitor database.WorkspaceAgentVolumeResourceMonitor,
datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage,
) []State {
states := make([]State, 0, len(datapoints))
for _, datapoint := range datapoints {
state := StateUnknown
if datapoint != nil {
percent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)
if percent < monitor.Threshold {
state = StateOK
} else {
state = StateNOK
}
}
states = append(states, state)
}
return states
}
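// NextState derives the monitor's next state from the per-datapoint states:
// enough consecutive NOKs or enough NOKs overall force an alert (NOK), an
// all-OK window returns the monitor to OK, and anything ambiguous keeps the
// previous state.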
func NextState(c Config, oldState database.WorkspaceAgentMonitorState, states []State) database.WorkspaceAgentMonitorState {
// If there are enough consecutive NOK states, we should be in an
// alert state.
consecutiveNOKs := slice.CountConsecutive(StateNOK, states...)
if percent(consecutiveNOKs, len(states)) >= c.Alert.ConsecutiveNOKsPercent {
return database.WorkspaceAgentMonitorStateNOK
}
// We do not explicitly handle StateUnknown because it could have
// been either StateOK or StateNOK if collection didn't fail. As
// it could be either, our best bet is to ignore it.
nokCount, okCount := 0, 0
for _, state := range states {
switch state {
case StateOK:
okCount++
case StateNOK:
nokCount++
}
}
// If there are enough NOK datapoints, we should be in an alert state.
if percent(nokCount, len(states)) >= c.Alert.MinimumNOKsPercent {
return database.WorkspaceAgentMonitorStateNOK
}
// If all datapoints are OK, we should be in an OK state
if okCount == len(states) {
return database.WorkspaceAgentMonitorStateOK
}
// Otherwise we stay in the same state as last.
return oldState
}
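// Illustrative example (assuming slice.CountConsecutive returns the length
// of the longest NOK run): with ConsecutiveNOKsPercent: 50 and
// MinimumNOKsPercent: 20, the states [OK, NOK, NOK, OK, NOK] have a longest
// NOK run of 2/5 (40% < 50%) but 3/5 NOK datapoints overall (60% >= 20%),
// so NextState returns NOK.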
func percent[T int](numerator, denominator T) int {
// Guard against an empty window; otherwise the division below
// yields NaN, whose conversion to int is implementation-specific.
if denominator == 0 {
return 0
}
percent := float64(numerator*100) / float64(denominator)
return int(math.Round(percent))
}