Mirror of https://github.com/coder/coder.git (synced 2025-07-13 21:36:50 +00:00)

Merge branch 'main' of github.com:/coder/coder into dk/prebuilds

Signed-off-by: Danny Kopping <danny@coder.com>
@@ -17,10 +17,12 @@ import (

    "cdr.dev/slog"
    agentproto "github.com/coder/coder/v2/agent/proto"
    "github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
    "github.com/coder/coder/v2/coderd/appearance"
    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/database/pubsub"
    "github.com/coder/coder/v2/coderd/externalauth"
    "github.com/coder/coder/v2/coderd/notifications"
    "github.com/coder/coder/v2/coderd/prometheusmetrics"
    "github.com/coder/coder/v2/coderd/tracing"
    "github.com/coder/coder/v2/coderd/workspacestats"

@@ -29,6 +31,7 @@ import (
    "github.com/coder/coder/v2/codersdk/agentsdk"
    "github.com/coder/coder/v2/tailnet"
    tailnetproto "github.com/coder/coder/v2/tailnet/proto"
    "github.com/coder/quartz"
)

// API implements the DRPC agent API interface from agent/proto. This struct is

@@ -59,7 +62,9 @@ type Options struct {

    Ctx context.Context
    Log slog.Logger
    Clock quartz.Clock
    Database database.Store
    NotificationsEnqueuer notifications.Enqueuer
    Pubsub pubsub.Pubsub
    DerpMapFn func() *tailcfg.DERPMap
    TailnetCoordinator *atomic.Pointer[tailnet.Coordinator]

@@ -82,6 +87,10 @@ type Options struct {
}

func New(opts Options) *API {
    if opts.Clock == nil {
        opts.Clock = quartz.NewReal()
    }

    api := &API{
        opts: opts,
        mu: sync.Mutex{},

@@ -106,9 +115,22 @@ func New(opts Options) *API {
    }

    api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{
        Log: opts.Log,
        AgentID: opts.AgentID,
        Database: opts.Database,
        AgentID: opts.AgentID,
        WorkspaceID: opts.WorkspaceID,
        Clock: opts.Clock,
        Database: opts.Database,
        NotificationsEnqueuer: opts.NotificationsEnqueuer,
        Debounce: 5 * time.Minute,

        Config: resourcesmonitor.Config{
            NumDatapoints: 20,
            CollectionInterval: 10 * time.Second,

            Alert: resourcesmonitor.AlertConfig{
                MinimumNOKsPercent: 20,
                ConsecutiveNOKsPercent: 50,
            },
        },
    }

    api.StatsAPI = &StatsAPI{
@@ -4,20 +4,35 @@ import (
    "context"
    "database/sql"
    "errors"
    "fmt"
    "time"

    "golang.org/x/xerrors"

    "cdr.dev/slog"

    "github.com/google/uuid"

    "cdr.dev/slog"
    "github.com/coder/coder/v2/agent/proto"
    "github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/database/dbauthz"
    "github.com/coder/coder/v2/coderd/database/dbtime"
    "github.com/coder/coder/v2/coderd/notifications"
    "github.com/coder/quartz"
)

type ResourcesMonitoringAPI struct {
    AgentID uuid.UUID
    Database database.Store
    Log slog.Logger
    AgentID uuid.UUID
    WorkspaceID uuid.UUID

    Log slog.Logger
    Clock quartz.Clock
    Database database.Store
    NotificationsEnqueuer notifications.Enqueuer

    Debounce time.Duration
    Config resourcesmonitor.Config
}

func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {

@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context

    return &proto.GetResourcesMonitoringConfigurationResponse{
        Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
            CollectionIntervalSeconds: 10,
            NumDatapoints: 20,
            CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
            NumDatapoints: a.Config.NumDatapoints,
        },
        Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
            if memoryErr != nil {

@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
}

func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
    a.Log.Info(ctx, "resources monitoring usage received",
        slog.F("request", req))
    var err error

    return &proto.PushResourcesMonitoringUsageResponse{}, nil
    if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
        err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
    }

    if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil {
        err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr))
    }

    return &proto.PushResourcesMonitoringUsageResponse{}, err
}

func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
    monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
    if err != nil {
        // It is valid for an agent to not have a memory monitor, so we
        // do not want to treat it as an error.
        if errors.Is(err, sql.ErrNoRows) {
            return nil
        }

        return xerrors.Errorf("fetch memory resource monitor: %w", err)
    }

    if !monitor.Enabled {
        return nil
    }

    usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, 0, len(datapoints))
    for _, datapoint := range datapoints {
        usageDatapoints = append(usageDatapoints, datapoint.Memory)
    }

    usageStates := resourcesmonitor.CalculateMemoryUsageStates(monitor, usageDatapoints)

    oldState := monitor.State
    newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

    debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

    //nolint:gocritic // We need to be able to update the resource monitor here.
    err = a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
        AgentID: a.AgentID,
        State: newState,
        UpdatedAt: dbtime.Time(a.Clock.Now()),
        DebouncedUntil: dbtime.Time(debouncedUntil),
    })
    if err != nil {
        return xerrors.Errorf("update workspace monitor: %w", err)
    }

    if !shouldNotify {
        return nil
    }

    workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
    if err != nil {
        return xerrors.Errorf("get workspace by id: %w", err)
    }

    _, err = a.NotificationsEnqueuer.EnqueueWithData(
        // nolint:gocritic // We need to be able to send the notification.
        dbauthz.AsNotifier(ctx),
        workspace.OwnerID,
        notifications.TemplateWorkspaceOutOfMemory,
        map[string]string{
            "workspace": workspace.Name,
            "threshold": fmt.Sprintf("%d%%", monitor.Threshold),
        },
        map[string]any{
            // NOTE(DanielleMaywood):
            // When notifications are enqueued, they are checked to be
            // unique within a single day. This means that if we attempt
            // to send two OOM notifications for the same workspace on
            // the same day, the enqueuer will prevent us from sending
            // a second one. We are inject a timestamp to make the
            // notifications appear different enough to circumvent this
            // deduplication logic.
            "timestamp": a.Clock.Now(),
        },
        "workspace-monitor-memory",
    )
    if err != nil {
        return xerrors.Errorf("notify workspace OOM: %w", err)
    }

    return nil
}

func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
    volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
    if err != nil {
        return xerrors.Errorf("get or insert volume monitor: %w", err)
    }

    outOfDiskVolumes := make([]map[string]any, 0)

    for _, monitor := range volumeMonitors {
        if !monitor.Enabled {
            continue
        }

        usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints))
        for _, datapoint := range datapoints {
            var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage

            for _, volume := range datapoint.Volumes {
                if volume.Volume == monitor.Path {
                    usage = volume
                    break
                }
            }

            usageDatapoints = append(usageDatapoints, usage)
        }

        usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints)

        oldState := monitor.State
        newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

        debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

        if shouldNotify {
            outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{
                "path": monitor.Path,
                "threshold": fmt.Sprintf("%d%%", monitor.Threshold),
            })
        }

        //nolint:gocritic // We need to be able to update the resource monitor here.
        if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
            AgentID: a.AgentID,
            Path: monitor.Path,
            State: newState,
            UpdatedAt: dbtime.Time(a.Clock.Now()),
            DebouncedUntil: dbtime.Time(debouncedUntil),
        }); err != nil {
            return xerrors.Errorf("update workspace monitor: %w", err)
        }
    }

    if len(outOfDiskVolumes) == 0 {
        return nil
    }

    workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
    if err != nil {
        return xerrors.Errorf("get workspace by id: %w", err)
    }

    if _, err := a.NotificationsEnqueuer.EnqueueWithData(
        // nolint:gocritic // We need to be able to send the notification.
        dbauthz.AsNotifier(ctx),
        workspace.OwnerID,
        notifications.TemplateWorkspaceOutOfDisk,
        map[string]string{
            "workspace": workspace.Name,
        },
        map[string]any{
            "volumes": outOfDiskVolumes,
            // NOTE(DanielleMaywood):
            // When notifications are enqueued, they are checked to be
            // unique within a single day. This means that if we attempt
            // to send two OOM notifications for the same workspace on
            // the same day, the enqueuer will prevent us from sending
            // a second one. We are inject a timestamp to make the
            // notifications appear different enough to circumvent this
            // deduplication logic.
            "timestamp": a.Clock.Now(),
        },
        "workspace-monitor-volumes",
    ); err != nil {
        return xerrors.Errorf("notify workspace OOD: %w", err)
    }

    return nil
}
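Note: the monitor.Debounce helper called from both monitorMemory and monitorVolumes is a method on the monitor rows returned by the database layer, and its implementation is not part of this diff. Based only on how it is called above and on the expectations in the tests below, a rough sketch of the behaviour it has to provide could look like this (a hypothetical illustration; the real method presumably reads the stored DebouncedUntil value from the monitor row rather than taking it as a parameter):

// debounceSketch is an illustration only, not the coderd implementation.
// It reports the next time an alert may fire and whether this particular
// state transition should send a notification now.
func debounceSketch(
    after time.Duration,
    now time.Time,
    debouncedUntil time.Time, // previously stored alongside the monitor
    oldState, newState database.WorkspaceAgentMonitorState,
) (time.Time, bool) {
    // Only a transition into the alerting state is a candidate for notifying.
    if oldState != database.WorkspaceAgentMonitorStateOK ||
        newState != database.WorkspaceAgentMonitorStateNOK {
        return debouncedUntil, false
    }
    // Still inside the debounce window opened by a previous alert: stay quiet.
    if now.Before(debouncedUntil) {
        return debouncedUntil, false
    }
    // Notify and open a new debounce window.
    return now.Add(after), true
}

That shape matches the scenarios exercised in the tests below: OK -> NOK notifies, NOK -> OK never notifies, and a second OK -> NOK inside the window stays silent until the debounce period has elapsed.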
coderd/agentapi/resources_monitoring_test.go (new file, 944 lines)

@@ -0,0 +1,944 @@
package agentapi_test

import (
    "context"
    "testing"
    "time"

    "github.com/google/uuid"
    "github.com/stretchr/testify/require"
    "google.golang.org/protobuf/types/known/timestamppb"

    agentproto "github.com/coder/coder/v2/agent/proto"
    "github.com/coder/coder/v2/coderd/agentapi"
    "github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/database/dbgen"
    "github.com/coder/coder/v2/coderd/database/dbtestutil"
    "github.com/coder/coder/v2/coderd/notifications"
    "github.com/coder/coder/v2/coderd/notifications/notificationstest"
    "github.com/coder/quartz"
)

func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
    t.Helper()

    db, _ := dbtestutil.NewDB(t)
    user := dbgen.User(t, db, database.User{})
    org := dbgen.Organization(t, db, database.Organization{})
    template := dbgen.Template(t, db, database.Template{
        OrganizationID: org.ID,
        CreatedBy: user.ID,
    })
    templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
        TemplateID: uuid.NullUUID{Valid: true, UUID: template.ID},
        OrganizationID: org.ID,
        CreatedBy: user.ID,
    })
    workspace := dbgen.Workspace(t, db, database.WorkspaceTable{
        OrganizationID: org.ID,
        TemplateID: template.ID,
        OwnerID: user.ID,
    })
    job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
        Type: database.ProvisionerJobTypeWorkspaceBuild,
    })
    build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
        JobID: job.ID,
        WorkspaceID: workspace.ID,
        TemplateVersionID: templateVersion.ID,
    })
    resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
        JobID: build.JobID,
    })
    agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
        ResourceID: resource.ID,
    })

    notifyEnq := &notificationstest.FakeEnqueuer{}
    clock := quartz.NewMock(t)

    return &agentapi.ResourcesMonitoringAPI{
        AgentID: agent.ID,
        WorkspaceID: workspace.ID,
        Clock: clock,
        Database: db,
        NotificationsEnqueuer: notifyEnq,
        Config: resourcesmonitor.Config{
            NumDatapoints: 20,
            CollectionInterval: 10 * time.Second,

            Alert: resourcesmonitor.AlertConfig{
                MinimumNOKsPercent: 20,
                ConsecutiveNOKsPercent: 50,
            },
        },
        Debounce: 1 * time.Minute,
    }, user, clock, notifyEnq
}

func TestMemoryResourceMonitorDebounce(t *testing.T) {
    t.Parallel()

    // This test is a bit of a long one. We're testing that
    // when a monitor goes into an alert state, it doesn't
    // allow another notification to occur until after the
    // debounce period.
    //
    // 1. OK -> NOK |> sends a notification
    // 2. NOK -> OK |> does nothing
    // 3. OK -> NOK |> does nothing due to debounce period
    // 4. NOK -> OK |> does nothing
    // 5. OK -> NOK |> sends a notification as debounce period exceeded

    api, user, clock, notifyEnq := resourceMonitorAPI(t)
    api.Config.Alert.ConsecutiveNOKsPercent = 100

    // Given: A monitor in an OK state
    dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
        AgentID: api.AgentID,
        State: database.WorkspaceAgentMonitorStateOK,
        Threshold: 80,
    })

    // When: The monitor is given a state that will trigger NOK
    _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                    Used: 10,
                    Total: 10,
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We expect there to be a notification sent
    sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
    require.Len(t, sent, 1)
    require.Equal(t, user.ID, sent[0].UserID)
    notifyEnq.Clear()

    // When: The monitor moves to an OK state from NOK
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                    Used: 1,
                    Total: 10,
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We expect no new notifications
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
    require.Len(t, sent, 0)
    notifyEnq.Clear()

    // When: The monitor moves back to a NOK state before the debounced time.
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                    Used: 10,
                    Total: 10,
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We expect no new notifications (showing the debouncer working)
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
    require.Len(t, sent, 0)
    notifyEnq.Clear()

    // When: The monitor moves back to an OK state from NOK
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                    Used: 1,
                    Total: 10,
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We still expect no new notifications
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
    require.Len(t, sent, 0)
    notifyEnq.Clear()

    // When: The monitor moves back to a NOK state after the debounce period.
    clock.Advance(api.Debounce/4 + 1*time.Second)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                    Used: 10,
                    Total: 10,
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We expect a notification
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
    require.Len(t, sent, 1)
    require.Equal(t, user.ID, sent[0].UserID)
}

func TestMemoryResourceMonitor(t *testing.T) {
    t.Parallel()

    tests := []struct {
        name string
        memoryUsage []int64
        memoryTotal int64
        previousState database.WorkspaceAgentMonitorState
        expectState database.WorkspaceAgentMonitorState
        shouldNotify bool
    }{
        {
            name: "WhenOK/NeverExceedsThreshold",
            memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenOK/ShouldStayInOK",
            memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenOK/ConsecutiveExceedsThreshold",
            memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: true,
        },
        {
            name: "WhenOK/MinimumExceedsThreshold",
            memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: true,
        },
        {
            name: "WhenNOK/NeverExceedsThreshold",
            memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/ShouldStayInNOK",
            memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/ConsecutiveExceedsThreshold",
            memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/MinimumExceedsThreshold",
            memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
            memoryTotal: 10,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
    }

    for _, tt := range tests {
        tt := tt

        t.Run(tt.name, func(t *testing.T) {
            t.Parallel()

            api, user, clock, notifyEnq := resourceMonitorAPI(t)

            datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
            collectedAt := clock.Now()
            for _, usage := range tt.memoryUsage {
                collectedAt = collectedAt.Add(15 * time.Second)
                datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                    CollectedAt: timestamppb.New(collectedAt),
                    Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                        Used: usage,
                        Total: tt.memoryTotal,
                    },
                })
            }

            dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
                AgentID: api.AgentID,
                State: tt.previousState,
                Threshold: 80,
            })

            clock.Set(collectedAt)
            _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
                Datapoints: datapoints,
            })
            require.NoError(t, err)

            sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
            if tt.shouldNotify {
                require.Len(t, sent, 1)
                require.Equal(t, user.ID, sent[0].UserID)
            } else {
                require.Len(t, sent, 0)
            }
        })
    }
}

func TestMemoryResourceMonitorMissingData(t *testing.T) {
    t.Parallel()

    t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
        t.Parallel()

        api, _, clock, notifyEnq := resourceMonitorAPI(t)
        api.Config.Alert.ConsecutiveNOKsPercent = 50
        api.Config.Alert.MinimumNOKsPercent = 100

        // Given: A monitor in an OK state.
        dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
            AgentID: api.AgentID,
            State: database.WorkspaceAgentMonitorStateOK,
            Threshold: 80,
        })

        // When: A datapoint is missing, surrounded by two NOK datapoints.
        _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
            Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                {
                    CollectedAt: timestamppb.New(clock.Now()),
                    Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                        Used: 10,
                        Total: 10,
                    },
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
                    Memory: nil,
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
                    Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                        Used: 10,
                        Total: 10,
                    },
                },
            },
        })
        require.NoError(t, err)

        // Then: We expect no notifications, as this unknown prevents us knowing we should alert.
        sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
        require.Len(t, sent, 0)

        // Then: We expect the monitor to still be in an OK state.
        monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
        require.NoError(t, err)
        require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
    })

    t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
        t.Parallel()

        api, _, clock, _ := resourceMonitorAPI(t)
        api.Config.Alert.ConsecutiveNOKsPercent = 50
        api.Config.Alert.MinimumNOKsPercent = 100

        // Given: A monitor in a NOK state.
        dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
            AgentID: api.AgentID,
            State: database.WorkspaceAgentMonitorStateNOK,
            Threshold: 80,
        })

        // When: A datapoint is missing, surrounded by two OK datapoints.
        _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
            Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                {
                    CollectedAt: timestamppb.New(clock.Now()),
                    Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                        Used: 1,
                        Total: 10,
                    },
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
                    Memory: nil,
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
                    Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
                        Used: 1,
                        Total: 10,
                    },
                },
            },
        })
        require.NoError(t, err)

        // Then: We expect the monitor to still be in a NOK state.
        monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
        require.NoError(t, err)
        require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
    })
}

func TestVolumeResourceMonitorDebounce(t *testing.T) {
    t.Parallel()

    // This test is an even longer one. We're testing
    // that the debounce logic is independent per
    // volume monitor. We interleave the triggering
    // of each monitor to ensure the debounce logic
    // is monitor independent.
    //
    // First Monitor:
    // 1. OK -> NOK |> sends a notification
    // 2. NOK -> OK |> does nothing
    // 3. OK -> NOK |> does nothing due to debounce period
    // 4. NOK -> OK |> does nothing
    // 5. OK -> NOK |> sends a notification as debounce period exceeded
    // 6. NOK -> OK |> does nothing
    //
    // Second Monitor:
    // 1. OK -> OK  |> does nothing
    // 2. OK -> NOK |> sends a notification
    // 3. NOK -> OK |> does nothing
    // 4. OK -> NOK |> does nothing due to debounce period
    // 5. NOK -> OK |> does nothing
    // 6. OK -> NOK |> sends a notification as debounce period exceeded
    //

    firstVolumePath := "/home/coder"
    secondVolumePath := "/dev/coder"

    api, _, clock, notifyEnq := resourceMonitorAPI(t)

    // Given:
    // - First monitor in an OK state
    // - Second monitor in an OK state
    dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
        AgentID: api.AgentID,
        Path: firstVolumePath,
        State: database.WorkspaceAgentMonitorStateOK,
        Threshold: 80,
    })
    dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
        AgentID: api.AgentID,
        Path: secondVolumePath,
        State: database.WorkspaceAgentMonitorStateNOK,
        Threshold: 80,
    })

    // When:
    // - First monitor is in a NOK state
    // - Second monitor is in an OK state
    _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 10, Total: 10},
                    {Volume: secondVolumePath, Used: 1, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect a notification from only the first monitor
    sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 1)
    volumes := requireVolumeData(t, sent[0])
    require.Len(t, volumes, 1)
    require.Equal(t, firstVolumePath, volumes[0]["path"])
    notifyEnq.Clear()

    // When:
    // - First monitor moves back to OK
    // - Second monitor moves to NOK
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 1, Total: 10},
                    {Volume: secondVolumePath, Used: 10, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect a notification from only the second monitor
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 1)
    volumes = requireVolumeData(t, sent[0])
    require.Len(t, volumes, 1)
    require.Equal(t, secondVolumePath, volumes[0]["path"])
    notifyEnq.Clear()

    // When:
    // - First monitor moves back to NOK before debounce period has ended
    // - Second monitor moves back to OK
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 10, Total: 10},
                    {Volume: secondVolumePath, Used: 1, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect no new notifications
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 0)
    notifyEnq.Clear()

    // When:
    // - First monitor moves back to OK
    // - Second monitor moves back to NOK
    clock.Advance(api.Debounce / 4)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 1, Total: 10},
                    {Volume: secondVolumePath, Used: 10, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect no new notifications.
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 0)
    notifyEnq.Clear()

    // When:
    // - First monitor moves back to a NOK state after the debounce period
    // - Second monitor moves back to OK
    clock.Advance(api.Debounce/4 + 1*time.Second)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 10, Total: 10},
                    {Volume: secondVolumePath, Used: 1, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect a notification from only the first monitor
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 1)
    volumes = requireVolumeData(t, sent[0])
    require.Len(t, volumes, 1)
    require.Equal(t, firstVolumePath, volumes[0]["path"])
    notifyEnq.Clear()

    // When:
    // - First montior moves back to OK
    // - Second monitor moves back to NOK after the debounce period
    clock.Advance(api.Debounce/4 + 1*time.Second)
    _, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {Volume: firstVolumePath, Used: 1, Total: 10},
                    {Volume: secondVolumePath, Used: 10, Total: 10},
                },
            },
        },
    })
    require.NoError(t, err)

    // Then:
    // - We expect a notification from only the second monitor
    sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 1)
    volumes = requireVolumeData(t, sent[0])
    require.Len(t, volumes, 1)
    require.Equal(t, secondVolumePath, volumes[0]["path"])
}

func TestVolumeResourceMonitor(t *testing.T) {
    t.Parallel()

    tests := []struct {
        name string
        volumePath string
        volumeUsage []int64
        volumeTotal int64
        thresholdPercent int32
        previousState database.WorkspaceAgentMonitorState
        expectState database.WorkspaceAgentMonitorState
        shouldNotify bool
    }{
        {
            name: "WhenOK/NeverExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenOK/ShouldStayInOK",
            volumePath: "/home/coder",
            volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenOK/ConsecutiveExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: true,
        },
        {
            name: "WhenOK/MinimumExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: true,
        },
        {
            name: "WhenNOK/NeverExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/ShouldStayInNOK",
            volumePath: "/home/coder",
            volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/ConsecutiveExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
        {
            name: "WhenNOK/MinimumExceedsThreshold",
            volumePath: "/home/coder",
            volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
            volumeTotal: 10,
            thresholdPercent: 80,
            previousState: database.WorkspaceAgentMonitorStateNOK,
            expectState: database.WorkspaceAgentMonitorStateNOK,
            shouldNotify: false,
        },
    }

    for _, tt := range tests {
        tt := tt

        t.Run(tt.name, func(t *testing.T) {
            t.Parallel()

            api, user, clock, notifyEnq := resourceMonitorAPI(t)

            datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
            collectedAt := clock.Now()
            for _, volumeUsage := range tt.volumeUsage {
                collectedAt = collectedAt.Add(15 * time.Second)

                volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {
                        Volume: tt.volumePath,
                        Used: volumeUsage,
                        Total: tt.volumeTotal,
                    },
                }

                datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                    CollectedAt: timestamppb.New(collectedAt),
                    Volumes: volumeDatapoints,
                })
            }

            dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
                AgentID: api.AgentID,
                Path: tt.volumePath,
                State: tt.previousState,
                Threshold: tt.thresholdPercent,
            })

            clock.Set(collectedAt)
            _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
                Datapoints: datapoints,
            })
            require.NoError(t, err)

            sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
            if tt.shouldNotify {
                require.Len(t, sent, 1)
                require.Equal(t, user.ID, sent[0].UserID)
            } else {
                require.Len(t, sent, 0)
            }
        })
    }
}

func TestVolumeResourceMonitorMultiple(t *testing.T) {
    t.Parallel()

    api, _, clock, notifyEnq := resourceMonitorAPI(t)
    api.Config.Alert.ConsecutiveNOKsPercent = 100

    // Given: two different volume resource monitors
    dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
        AgentID: api.AgentID,
        Path: "/home/coder",
        State: database.WorkspaceAgentMonitorStateOK,
        Threshold: 80,
    })

    dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
        AgentID: api.AgentID,
        Path: "/dev/coder",
        State: database.WorkspaceAgentMonitorStateOK,
        Threshold: 80,
    })

    // When: both of them move to a NOK state
    _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
        Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
            {
                CollectedAt: timestamppb.New(clock.Now()),
                Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                    {
                        Volume: "/home/coder",
                        Used: 10,
                        Total: 10,
                    },
                    {
                        Volume: "/dev/coder",
                        Used: 10,
                        Total: 10,
                    },
                },
            },
        },
    })
    require.NoError(t, err)

    // Then: We expect a notification to alert with information about both
    sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
    require.Len(t, sent, 1)

    volumes := requireVolumeData(t, sent[0])
    require.Len(t, volumes, 2)
    require.Equal(t, "/home/coder", volumes[0]["path"])
    require.Equal(t, "/dev/coder", volumes[1]["path"])
}

func TestVolumeResourceMonitorMissingData(t *testing.T) {
    t.Parallel()

    t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
        t.Parallel()

        volumePath := "/home/coder"

        api, _, clock, notifyEnq := resourceMonitorAPI(t)
        api.Config.Alert.ConsecutiveNOKsPercent = 50
        api.Config.Alert.MinimumNOKsPercent = 100

        // Given: A monitor in an OK state.
        dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
            AgentID: api.AgentID,
            Path: volumePath,
            State: database.WorkspaceAgentMonitorStateOK,
            Threshold: 80,
        })

        // When: A datapoint is missing, surrounded by two NOK datapoints.
        _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
            Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                {
                    CollectedAt: timestamppb.New(clock.Now()),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                        {
                            Volume: volumePath,
                            Used: 10,
                            Total: 10,
                        },
                    },
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                        {
                            Volume: volumePath,
                            Used: 10,
                            Total: 10,
                        },
                    },
                },
            },
        })
        require.NoError(t, err)

        // Then: We expect no notifications, as this unknown prevents us knowing we should alert.
        sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
        require.Len(t, sent, 0)

        // Then: We expect the monitor to still be in an OK state.
        monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
        require.NoError(t, err)
        require.Len(t, monitors, 1)
        require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
    })

    t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
        t.Parallel()

        volumePath := "/home/coder"

        api, _, clock, _ := resourceMonitorAPI(t)
        api.Config.Alert.ConsecutiveNOKsPercent = 50
        api.Config.Alert.MinimumNOKsPercent = 100

        // Given: A monitor in a NOK state.
        dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
            AgentID: api.AgentID,
            Path: volumePath,
            State: database.WorkspaceAgentMonitorStateNOK,
            Threshold: 80,
        })

        // When: A datapoint is missing, surrounded by two OK datapoints.
        _, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
            Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
                {
                    CollectedAt: timestamppb.New(clock.Now()),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                        {
                            Volume: volumePath,
                            Used: 1,
                            Total: 10,
                        },
                    },
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
                },
                {
                    CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
                    Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
                        {
                            Volume: volumePath,
                            Used: 1,
                            Total: 10,
                        },
                    },
                },
            },
        })
        require.NoError(t, err)

        // Then: We expect the monitor to still be in a NOK state.
        monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
        require.NoError(t, err)
        require.Len(t, monitors, 1)
        require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
    })
}

func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
    t.Helper()

    volumesData := notif.Data["volumes"]
    require.IsType(t, []map[string]any{}, volumesData)

    return volumesData.([]map[string]any)
}
coderd/agentapi/resourcesmonitor/resources_monitor.go (new file, 129 lines)

@@ -0,0 +1,129 @@
package resourcesmonitor

import (
    "math"
    "time"

    "github.com/coder/coder/v2/agent/proto"
    "github.com/coder/coder/v2/coderd/database"
    "github.com/coder/coder/v2/coderd/util/slice"
)

type State int

const (
    StateOK State = iota
    StateNOK
    StateUnknown
)

type AlertConfig struct {
    // What percentage of datapoints in a row are
    // required to put the monitor in an alert state.
    ConsecutiveNOKsPercent int

    // What percentage of datapoints in a window are
    // required to put the monitor in an alert state.
    MinimumNOKsPercent int
}

type Config struct {
    // How many datapoints should the agent send
    NumDatapoints int32

    // How long between each datapoint should
    // collection occur.
    CollectionInterval time.Duration

    Alert AlertConfig
}

func CalculateMemoryUsageStates(
    monitor database.WorkspaceAgentMemoryResourceMonitor,
    datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage,
) []State {
    states := make([]State, 0, len(datapoints))

    for _, datapoint := range datapoints {
        state := StateUnknown

        if datapoint != nil {
            percent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)

            if percent < monitor.Threshold {
                state = StateOK
            } else {
                state = StateNOK
            }
        }

        states = append(states, state)
    }

    return states
}

func CalculateVolumeUsageStates(
    monitor database.WorkspaceAgentVolumeResourceMonitor,
    datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage,
) []State {
    states := make([]State, 0, len(datapoints))

    for _, datapoint := range datapoints {
        state := StateUnknown

        if datapoint != nil {
            percent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)

            if percent < monitor.Threshold {
                state = StateOK
            } else {
                state = StateNOK
            }
        }

        states = append(states, state)
    }

    return states
}

func NextState(c Config, oldState database.WorkspaceAgentMonitorState, states []State) database.WorkspaceAgentMonitorState {
    // If there are enough consecutive NOK states, we should be in an
    // alert state.
    consecutiveNOKs := slice.CountConsecutive(StateNOK, states...)
    if percent(consecutiveNOKs, len(states)) >= c.Alert.ConsecutiveNOKsPercent {
        return database.WorkspaceAgentMonitorStateNOK
    }

    // We do not explicitly handle StateUnknown because it could have
    // been either StateOK or StateNOK if collection didn't fail. As
    // it could be either, our best bet is to ignore it.
    nokCount, okCount := 0, 0
    for _, state := range states {
        switch state {
        case StateOK:
            okCount++
        case StateNOK:
            nokCount++
        }
    }

    // If there are enough NOK datapoints, we should be in an alert state.
    if percent(nokCount, len(states)) >= c.Alert.MinimumNOKsPercent {
        return database.WorkspaceAgentMonitorStateNOK
    }

    // If all datapoints are OK, we should be in an OK state
    if okCount == len(states) {
        return database.WorkspaceAgentMonitorStateOK
    }

    // Otherwise we stay in the same state as last.
    return oldState
}

func percent[T int](numerator, denominator T) int {
    percent := float64(numerator*100) / float64(denominator)
    return int(math.Round(percent))
}
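To make the two alert thresholds concrete, here is a small, self-contained usage sketch of the package introduced above. The values are illustrative only and mirror the defaults wired into agentapi.New earlier in this diff:

package main

import (
    "fmt"
    "time"

    "github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
    "github.com/coder/coder/v2/coderd/database"
)

func main() {
    cfg := resourcesmonitor.Config{
        NumDatapoints:      20,
        CollectionInterval: 10 * time.Second,
        Alert: resourcesmonitor.AlertConfig{
            MinimumNOKsPercent:     20,
            ConsecutiveNOKsPercent: 50,
        },
    }

    // Half of this window breaches the threshold: 2 of 4 datapoints are NOK
    // (50% >= MinimumNOKsPercent) and the two NOKs are adjacent
    // (50% >= ConsecutiveNOKsPercent), so the monitor moves into the alert
    // state regardless of which rule is evaluated first.
    states := []resourcesmonitor.State{
        resourcesmonitor.StateOK,
        resourcesmonitor.StateOK,
        resourcesmonitor.StateNOK,
        resourcesmonitor.StateNOK,
    }

    next := resourcesmonitor.NextState(cfg, database.WorkspaceAgentMonitorStateOK, states)
    fmt.Println(next == database.WorkspaceAgentMonitorStateNOK) // true
}

A datapoint that arrives as nil is mapped to StateUnknown by the Calculate*UsageStates helpers and then ignored by NextState, which is what keeps a single missing sample from flipping the monitor in either direction (see the MissingData tests above).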