mirror of
https://github.com/coder/coder.git
synced 2025-07-03 16:13:58 +00:00
chore: implement oom/ood processing component (#16436)
Implements the processing logic as set out in the OOM/OOD RFC.
This commit is contained in:
944
coderd/agentapi/resources_monitoring_test.go
Normal file
944
coderd/agentapi/resources_monitoring_test.go
Normal file
@ -0,0 +1,944 @@
|
||||
package agentapi_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/protobuf/types/known/timestamppb"
|
||||
|
||||
agentproto "github.com/coder/coder/v2/agent/proto"
|
||||
"github.com/coder/coder/v2/coderd/agentapi"
|
||||
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/database/dbgen"
|
||||
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
||||
"github.com/coder/coder/v2/coderd/notifications"
|
||||
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
|
||||
"github.com/coder/quartz"
|
||||
)
|
||||
|
||||
func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
|
||||
t.Helper()
|
||||
|
||||
db, _ := dbtestutil.NewDB(t)
|
||||
user := dbgen.User(t, db, database.User{})
|
||||
org := dbgen.Organization(t, db, database.Organization{})
|
||||
template := dbgen.Template(t, db, database.Template{
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: user.ID,
|
||||
})
|
||||
templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
||||
TemplateID: uuid.NullUUID{Valid: true, UUID: template.ID},
|
||||
OrganizationID: org.ID,
|
||||
CreatedBy: user.ID,
|
||||
})
|
||||
workspace := dbgen.Workspace(t, db, database.WorkspaceTable{
|
||||
OrganizationID: org.ID,
|
||||
TemplateID: template.ID,
|
||||
OwnerID: user.ID,
|
||||
})
|
||||
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
|
||||
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
||||
})
|
||||
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
|
||||
JobID: job.ID,
|
||||
WorkspaceID: workspace.ID,
|
||||
TemplateVersionID: templateVersion.ID,
|
||||
})
|
||||
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
|
||||
JobID: build.JobID,
|
||||
})
|
||||
agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
|
||||
ResourceID: resource.ID,
|
||||
})
|
||||
|
||||
notifyEnq := ¬ificationstest.FakeEnqueuer{}
|
||||
clock := quartz.NewMock(t)
|
||||
|
||||
return &agentapi.ResourcesMonitoringAPI{
|
||||
AgentID: agent.ID,
|
||||
WorkspaceID: workspace.ID,
|
||||
Clock: clock,
|
||||
Database: db,
|
||||
NotificationsEnqueuer: notifyEnq,
|
||||
Config: resourcesmonitor.Config{
|
||||
NumDatapoints: 20,
|
||||
CollectionInterval: 10 * time.Second,
|
||||
|
||||
Alert: resourcesmonitor.AlertConfig{
|
||||
MinimumNOKsPercent: 20,
|
||||
ConsecutiveNOKsPercent: 50,
|
||||
},
|
||||
},
|
||||
Debounce: 1 * time.Minute,
|
||||
}, user, clock, notifyEnq
|
||||
}
|
||||
|
||||
func TestMemoryResourceMonitorDebounce(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// This test is a bit of a long one. We're testing that
|
||||
// when a monitor goes into an alert state, it doesn't
|
||||
// allow another notification to occur until after the
|
||||
// debounce period.
|
||||
//
|
||||
// 1. OK -> NOK |> sends a notification
|
||||
// 2. NOK -> OK |> does nothing
|
||||
// 3. OK -> NOK |> does nothing due to debounce period
|
||||
// 4. NOK -> OK |> does nothing
|
||||
// 5. OK -> NOK |> sends a notification as debounce period exceeded
|
||||
|
||||
api, user, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 100
|
||||
|
||||
// Given: A monitor in an OK state
|
||||
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: The monitor is given a state that will trigger NOK
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect there to be a notification sent
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 1)
|
||||
require.Equal(t, user.ID, sent[0].UserID)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When: The monitor moves to an OK state from NOK
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect no new notifications
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 0)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When: The monitor moves back to a NOK state before the debounced time.
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect no new notifications (showing the debouncer working)
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 0)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When: The monitor moves back to an OK state from NOK
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We still expect no new notifications
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 0)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When: The monitor moves back to a NOK state after the debounce period.
|
||||
clock.Advance(api.Debounce/4 + 1*time.Second)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect a notification
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 1)
|
||||
require.Equal(t, user.ID, sent[0].UserID)
|
||||
}
|
||||
|
||||
func TestMemoryResourceMonitor(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
memoryUsage []int64
|
||||
memoryTotal int64
|
||||
previousState database.WorkspaceAgentMonitorState
|
||||
expectState database.WorkspaceAgentMonitorState
|
||||
shouldNotify bool
|
||||
}{
|
||||
{
|
||||
name: "WhenOK/NeverExceedsThreshold",
|
||||
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/ShouldStayInOK",
|
||||
memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/ConsecutiveExceedsThreshold",
|
||||
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: true,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/MinimumExceedsThreshold",
|
||||
memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: true,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/NeverExceedsThreshold",
|
||||
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/ShouldStayInNOK",
|
||||
memoryUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/ConsecutiveExceedsThreshold",
|
||||
memoryUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/MinimumExceedsThreshold",
|
||||
memoryUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
|
||||
memoryTotal: 10,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt
|
||||
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
api, user, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
|
||||
datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
|
||||
collectedAt := clock.Now()
|
||||
for _, usage := range tt.memoryUsage {
|
||||
collectedAt = collectedAt.Add(15 * time.Second)
|
||||
datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
CollectedAt: timestamppb.New(collectedAt),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: usage,
|
||||
Total: tt.memoryTotal,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
State: tt.previousState,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
clock.Set(collectedAt)
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: datapoints,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
if tt.shouldNotify {
|
||||
require.Len(t, sent, 1)
|
||||
require.Equal(t, user.ID, sent[0].UserID)
|
||||
} else {
|
||||
require.Len(t, sent, 0)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryResourceMonitorMissingData(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
api, _, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 50
|
||||
api.Config.Alert.MinimumNOKsPercent = 100
|
||||
|
||||
// Given: A monitor in an OK state.
|
||||
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: A datapoint is missing, surrounded by two NOK datapoints.
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
|
||||
Memory: nil,
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
|
||||
require.Len(t, sent, 0)
|
||||
|
||||
// Then: We expect the monitor to still be in an OK state.
|
||||
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
|
||||
})
|
||||
|
||||
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
api, _, clock, _ := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 50
|
||||
api.Config.Alert.MinimumNOKsPercent = 100
|
||||
|
||||
// Given: A monitor in a NOK state.
|
||||
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
State: database.WorkspaceAgentMonitorStateNOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: A datapoint is missing, surrounded by two OK datapoints.
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
|
||||
Memory: nil,
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
|
||||
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect the monitor to still be in a NOK state.
|
||||
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
|
||||
})
|
||||
}
|
||||
|
||||
func TestVolumeResourceMonitorDebounce(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// This test is an even longer one. We're testing
|
||||
// that the debounce logic is independent per
|
||||
// volume monitor. We interleave the triggering
|
||||
// of each monitor to ensure the debounce logic
|
||||
// is monitor independent.
|
||||
//
|
||||
// First Monitor:
|
||||
// 1. OK -> NOK |> sends a notification
|
||||
// 2. NOK -> OK |> does nothing
|
||||
// 3. OK -> NOK |> does nothing due to debounce period
|
||||
// 4. NOK -> OK |> does nothing
|
||||
// 5. OK -> NOK |> sends a notification as debounce period exceeded
|
||||
// 6. NOK -> OK |> does nothing
|
||||
//
|
||||
// Second Monitor:
|
||||
// 1. OK -> OK |> does nothing
|
||||
// 2. OK -> NOK |> sends a notification
|
||||
// 3. NOK -> OK |> does nothing
|
||||
// 4. OK -> NOK |> does nothing due to debounce period
|
||||
// 5. NOK -> OK |> does nothing
|
||||
// 6. OK -> NOK |> sends a notification as debounce period exceeded
|
||||
//
|
||||
|
||||
firstVolumePath := "/home/coder"
|
||||
secondVolumePath := "/dev/coder"
|
||||
|
||||
api, _, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
|
||||
// Given:
|
||||
// - First monitor in an OK state
|
||||
// - Second monitor in an OK state
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: firstVolumePath,
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: secondVolumePath,
|
||||
State: database.WorkspaceAgentMonitorStateNOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When:
|
||||
// - First monitor is in a NOK state
|
||||
// - Second monitor is in an OK state
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 10, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 1, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect a notification from only the first monitor
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 1)
|
||||
volumes := requireVolumeData(t, sent[0])
|
||||
require.Len(t, volumes, 1)
|
||||
require.Equal(t, firstVolumePath, volumes[0]["path"])
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When:
|
||||
// - First monitor moves back to OK
|
||||
// - Second monitor moves to NOK
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 1, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 10, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect a notification from only the second monitor
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 1)
|
||||
volumes = requireVolumeData(t, sent[0])
|
||||
require.Len(t, volumes, 1)
|
||||
require.Equal(t, secondVolumePath, volumes[0]["path"])
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When:
|
||||
// - First monitor moves back to NOK before debounce period has ended
|
||||
// - Second monitor moves back to OK
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 10, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 1, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect no new notifications
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 0)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When:
|
||||
// - First monitor moves back to OK
|
||||
// - Second monitor moves back to NOK
|
||||
clock.Advance(api.Debounce / 4)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 1, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 10, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect no new notifications.
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 0)
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When:
|
||||
// - First monitor moves back to a NOK state after the debounce period
|
||||
// - Second monitor moves back to OK
|
||||
clock.Advance(api.Debounce/4 + 1*time.Second)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 10, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 1, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect a notification from only the first monitor
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 1)
|
||||
volumes = requireVolumeData(t, sent[0])
|
||||
require.Len(t, volumes, 1)
|
||||
require.Equal(t, firstVolumePath, volumes[0]["path"])
|
||||
notifyEnq.Clear()
|
||||
|
||||
// When:
|
||||
// - First montior moves back to OK
|
||||
// - Second monitor moves back to NOK after the debounce period
|
||||
clock.Advance(api.Debounce/4 + 1*time.Second)
|
||||
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{Volume: firstVolumePath, Used: 1, Total: 10},
|
||||
{Volume: secondVolumePath, Used: 10, Total: 10},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then:
|
||||
// - We expect a notification from only the second monitor
|
||||
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 1)
|
||||
volumes = requireVolumeData(t, sent[0])
|
||||
require.Len(t, volumes, 1)
|
||||
require.Equal(t, secondVolumePath, volumes[0]["path"])
|
||||
}
|
||||
|
||||
func TestVolumeResourceMonitor(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
volumePath string
|
||||
volumeUsage []int64
|
||||
volumeTotal int64
|
||||
thresholdPercent int32
|
||||
previousState database.WorkspaceAgentMonitorState
|
||||
expectState database.WorkspaceAgentMonitorState
|
||||
shouldNotify bool
|
||||
}{
|
||||
{
|
||||
name: "WhenOK/NeverExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/ShouldStayInOK",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/ConsecutiveExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: true,
|
||||
},
|
||||
{
|
||||
name: "WhenOK/MinimumExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: true,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/NeverExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/ShouldStayInNOK",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/ConsecutiveExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
{
|
||||
name: "WhenNOK/MinimumExceedsThreshold",
|
||||
volumePath: "/home/coder",
|
||||
volumeUsage: []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
|
||||
volumeTotal: 10,
|
||||
thresholdPercent: 80,
|
||||
previousState: database.WorkspaceAgentMonitorStateNOK,
|
||||
expectState: database.WorkspaceAgentMonitorStateNOK,
|
||||
shouldNotify: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
tt := tt
|
||||
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
api, user, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
|
||||
datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
|
||||
collectedAt := clock.Now()
|
||||
for _, volumeUsage := range tt.volumeUsage {
|
||||
collectedAt = collectedAt.Add(15 * time.Second)
|
||||
|
||||
volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: tt.volumePath,
|
||||
Used: volumeUsage,
|
||||
Total: tt.volumeTotal,
|
||||
},
|
||||
}
|
||||
|
||||
datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
CollectedAt: timestamppb.New(collectedAt),
|
||||
Volumes: volumeDatapoints,
|
||||
})
|
||||
}
|
||||
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: tt.volumePath,
|
||||
State: tt.previousState,
|
||||
Threshold: tt.thresholdPercent,
|
||||
})
|
||||
|
||||
clock.Set(collectedAt)
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: datapoints,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
if tt.shouldNotify {
|
||||
require.Len(t, sent, 1)
|
||||
require.Equal(t, user.ID, sent[0].UserID)
|
||||
} else {
|
||||
require.Len(t, sent, 0)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolumeResourceMonitorMultiple(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
api, _, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 100
|
||||
|
||||
// Given: two different volume resource monitors
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: "/home/coder",
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: "/dev/coder",
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: both of them move to a NOK state
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: "/home/coder",
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
{
|
||||
Volume: "/dev/coder",
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect a notification to alert with information about both
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 1)
|
||||
|
||||
volumes := requireVolumeData(t, sent[0])
|
||||
require.Len(t, volumes, 2)
|
||||
require.Equal(t, "/home/coder", volumes[0]["path"])
|
||||
require.Equal(t, "/dev/coder", volumes[1]["path"])
|
||||
}
|
||||
|
||||
func TestVolumeResourceMonitorMissingData(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
volumePath := "/home/coder"
|
||||
|
||||
api, _, clock, notifyEnq := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 50
|
||||
api.Config.Alert.MinimumNOKsPercent = 100
|
||||
|
||||
// Given: A monitor in an OK state.
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: volumePath,
|
||||
State: database.WorkspaceAgentMonitorStateOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: A datapoint is missing, surrounded by two NOK datapoints.
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: volumePath,
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: volumePath,
|
||||
Used: 10,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
|
||||
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
||||
require.Len(t, sent, 0)
|
||||
|
||||
// Then: We expect the monitor to still be in an OK state.
|
||||
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, monitors, 1)
|
||||
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
|
||||
})
|
||||
|
||||
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
volumePath := "/home/coder"
|
||||
|
||||
api, _, clock, _ := resourceMonitorAPI(t)
|
||||
api.Config.Alert.ConsecutiveNOKsPercent = 50
|
||||
api.Config.Alert.MinimumNOKsPercent = 100
|
||||
|
||||
// Given: A monitor in a NOK state.
|
||||
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
||||
AgentID: api.AgentID,
|
||||
Path: volumePath,
|
||||
State: database.WorkspaceAgentMonitorStateNOK,
|
||||
Threshold: 80,
|
||||
})
|
||||
|
||||
// When: A datapoint is missing, surrounded by two OK datapoints.
|
||||
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
||||
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now()),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: volumePath,
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
|
||||
},
|
||||
{
|
||||
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
|
||||
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
||||
{
|
||||
Volume: volumePath,
|
||||
Used: 1,
|
||||
Total: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Then: We expect the monitor to still be in a NOK state.
|
||||
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, monitors, 1)
|
||||
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
|
||||
})
|
||||
}
|
||||
|
||||
func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
|
||||
t.Helper()
|
||||
|
||||
volumesData := notif.Data["volumes"]
|
||||
require.IsType(t, []map[string]any{}, volumesData)
|
||||
|
||||
return volumesData.([]map[string]any)
|
||||
}
|
Reference in New Issue
Block a user