// Mirror of https://github.com/coder/coder.git
// Synced 2025-07-03 16:13:58 +00:00 (945 lines, 31 KiB, Go)
package agentapi_test
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/stretchr/testify/require"
|
|
"google.golang.org/protobuf/types/known/timestamppb"
|
|
|
|
agentproto "github.com/coder/coder/v2/agent/proto"
|
|
"github.com/coder/coder/v2/coderd/agentapi"
|
|
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbgen"
|
|
"github.com/coder/coder/v2/coderd/database/dbtestutil"
|
|
"github.com/coder/coder/v2/coderd/notifications"
|
|
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
|
|
t.Helper()
|
|
|
|
db, _ := dbtestutil.NewDB(t)
|
|
user := dbgen.User(t, db, database.User{})
|
|
org := dbgen.Organization(t, db, database.Organization{})
|
|
template := dbgen.Template(t, db, database.Template{
|
|
OrganizationID: org.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
|
|
TemplateID: uuid.NullUUID{Valid: true, UUID: template.ID},
|
|
OrganizationID: org.ID,
|
|
CreatedBy: user.ID,
|
|
})
|
|
workspace := dbgen.Workspace(t, db, database.WorkspaceTable{
|
|
OrganizationID: org.ID,
|
|
TemplateID: template.ID,
|
|
OwnerID: user.ID,
|
|
})
|
|
job := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
|
|
Type: database.ProvisionerJobTypeWorkspaceBuild,
|
|
})
|
|
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
|
|
JobID: job.ID,
|
|
WorkspaceID: workspace.ID,
|
|
TemplateVersionID: templateVersion.ID,
|
|
})
|
|
resource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
|
|
JobID: build.JobID,
|
|
})
|
|
agent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
|
|
ResourceID: resource.ID,
|
|
})
|
|
|
|
notifyEnq := ¬ificationstest.FakeEnqueuer{}
|
|
clock := quartz.NewMock(t)
|
|
|
|
return &agentapi.ResourcesMonitoringAPI{
|
|
AgentID: agent.ID,
|
|
WorkspaceID: workspace.ID,
|
|
Clock: clock,
|
|
Database: db,
|
|
NotificationsEnqueuer: notifyEnq,
|
|
Config: resourcesmonitor.Config{
|
|
NumDatapoints: 20,
|
|
CollectionInterval: 10 * time.Second,
|
|
|
|
Alert: resourcesmonitor.AlertConfig{
|
|
MinimumNOKsPercent: 20,
|
|
ConsecutiveNOKsPercent: 50,
|
|
},
|
|
},
|
|
Debounce: 1 * time.Minute,
|
|
}, user, clock, notifyEnq
|
|
}
|
|
|
|
// TestMemoryResourceMonitorDebounce verifies that once a memory monitor
// alerts, the debounce window suppresses further alerts until it has fully
// elapsed, even across intermediate OK/NOK state flips.
func TestMemoryResourceMonitorDebounce(t *testing.T) {
	t.Parallel()

	// This test is a bit of a long one. We're testing that
	// when a monitor goes into an alert state, it doesn't
	// allow another notification to occur until after the
	// debounce period.
	//
	// 1. OK -> NOK |> sends a notification
	// 2. NOK -> OK |> does nothing
	// 3. OK -> NOK |> does nothing due to debounce period
	// 4. NOK -> OK |> does nothing
	// 5. OK -> NOK |> sends a notification as debounce period exceeded

	api, user, clock, notifyEnq := resourceMonitorAPI(t)
	// Require 100% consecutive NOK datapoints so a single NOK datapoint is
	// enough to flip the monitor's state.
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// Given: A monitor in an OK state
	dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
		AgentID:   api.AgentID,
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	// When: The monitor is given a state that will trigger NOK
	// (100% usage, above the 80% threshold).
	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
					Used:  10,
					Total: 10,
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We expect there to be a notification sent
	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
	require.Len(t, sent, 1)
	require.Equal(t, user.ID, sent[0].UserID)
	notifyEnq.Clear()

	// When: The monitor moves to an OK state from NOK
	clock.Advance(api.Debounce / 4)
	_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
					Used:  1,
					Total: 10,
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We expect no new notifications
	sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
	require.Len(t, sent, 0)
	notifyEnq.Clear()

	// When: The monitor moves back to a NOK state before the debounced time.
	clock.Advance(api.Debounce / 4)
	_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
					Used:  10,
					Total: 10,
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We expect no new notifications (showing the debouncer working)
	sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
	require.Len(t, sent, 0)
	notifyEnq.Clear()

	// When: The monitor moves back to an OK state from NOK
	clock.Advance(api.Debounce / 4)
	_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
					Used:  1,
					Total: 10,
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We still expect no new notifications
	sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
	require.Len(t, sent, 0)
	notifyEnq.Clear()

	// When: The monitor moves back to a NOK state after the debounce period.
	// Total elapsed is now Debounce + 1s, so the window has expired.
	clock.Advance(api.Debounce/4 + 1*time.Second)
	_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
					Used:  10,
					Total: 10,
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We expect a notification
	sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
	require.Len(t, sent, 1)
	require.Equal(t, user.ID, sent[0].UserID)
}
|
|
|
|
// TestMemoryResourceMonitor is a table-driven test of memory monitor state
// transitions: given a prior state and a sequence of usage datapoints against
// a threshold of 80%, it checks the resulting state and whether an
// out-of-memory notification is sent. Notifications are only expected on an
// OK -> NOK transition, never when already in NOK.
func TestMemoryResourceMonitor(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name          string
		memoryUsage   []int64 // usage datapoints, one per 15s interval
		memoryTotal   int64
		previousState database.WorkspaceAgentMonitorState
		expectState   database.WorkspaceAgentMonitorState
		shouldNotify  bool
	}{
		{
			name:          "WhenOK/NeverExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			// A single early spike is not enough to trip either alert rule.
			name:          "WhenOK/ShouldStayInOK",
			memoryUsage:   []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			// A run of trailing NOK datapoints trips the consecutive rule.
			name:          "WhenOK/ConsecutiveExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  true,
		},
		{
			// Scattered NOK datapoints trip the minimum-percentage rule.
			name:          "WhenOK/MinimumExceedsThreshold",
			memoryUsage:   []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/NeverExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ShouldStayInNOK",
			memoryUsage:   []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ConsecutiveExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/MinimumExceedsThreshold",
			memoryUsage:   []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
	}

	for _, tt := range tests {
		tt := tt // capture loop variable (pre-Go 1.22 semantics)

		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Build one datapoint per usage sample, spaced 15s apart.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
			collectedAt := clock.Now()
			for _, usage := range tt.memoryUsage {
				collectedAt = collectedAt.Add(15 * time.Second)
				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  usage,
						Total: tt.memoryTotal,
					},
				})
			}

			dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
				AgentID:   api.AgentID,
				State:     tt.previousState,
				Threshold: 80,
			})

			// Align the mock clock with the last datapoint's timestamp.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}
		})
	}
}
|
|
|
|
// TestMemoryResourceMonitorMissingData verifies that a datapoint with nil
// memory usage ("unknown") prevents the monitor from changing state in
// either direction: no alert is raised from OK, and no recovery occurs
// from NOK.
func TestMemoryResourceMonitorMissingData(t *testing.T) {
	t.Parallel()

	t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
		t.Parallel()

		api, _, clock, notifyEnq := resourceMonitorAPI(t)
		// With MinimumNOKsPercent at 100, every datapoint must be NOK to
		// alert; the unknown datapoint below breaks that requirement.
		api.Config.Alert.ConsecutiveNOKsPercent = 50
		api.Config.Alert.MinimumNOKsPercent = 100

		// Given: A monitor in an OK state.
		dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
			AgentID:   api.AgentID,
			State:     database.WorkspaceAgentMonitorStateOK,
			Threshold: 80,
		})

		// When: A datapoint is missing, surrounded by two NOK datapoints.
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  10,
						Total: 10,
					},
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
					Memory:      nil, // the "unknown" datapoint
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  10,
						Total: 10,
					},
				},
			},
		})
		require.NoError(t, err)

		// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
		require.Len(t, sent, 0)

		// Then: We expect the monitor to still be in an OK state.
		monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
		require.NoError(t, err)
		require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
	})

	t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
		t.Parallel()

		api, _, clock, _ := resourceMonitorAPI(t)
		api.Config.Alert.ConsecutiveNOKsPercent = 50
		api.Config.Alert.MinimumNOKsPercent = 100

		// Given: A monitor in a NOK state.
		dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
			AgentID:   api.AgentID,
			State:     database.WorkspaceAgentMonitorStateNOK,
			Threshold: 80,
		})

		// When: A datapoint is missing, surrounded by two OK datapoints.
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  1,
						Total: 10,
					},
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
					Memory:      nil, // the "unknown" datapoint
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  1,
						Total: 10,
					},
				},
			},
		})
		require.NoError(t, err)

		// Then: We expect the monitor to still be in a NOK state.
		monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
		require.NoError(t, err)
		require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
	})
}
|
|
|
|
func TestVolumeResourceMonitorDebounce(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// This test is an even longer one. We're testing
|
|
// that the debounce logic is independent per
|
|
// volume monitor. We interleave the triggering
|
|
// of each monitor to ensure the debounce logic
|
|
// is monitor independent.
|
|
//
|
|
// First Monitor:
|
|
// 1. OK -> NOK |> sends a notification
|
|
// 2. NOK -> OK |> does nothing
|
|
// 3. OK -> NOK |> does nothing due to debounce period
|
|
// 4. NOK -> OK |> does nothing
|
|
// 5. OK -> NOK |> sends a notification as debounce period exceeded
|
|
// 6. NOK -> OK |> does nothing
|
|
//
|
|
// Second Monitor:
|
|
// 1. OK -> OK |> does nothing
|
|
// 2. OK -> NOK |> sends a notification
|
|
// 3. NOK -> OK |> does nothing
|
|
// 4. OK -> NOK |> does nothing due to debounce period
|
|
// 5. NOK -> OK |> does nothing
|
|
// 6. OK -> NOK |> sends a notification as debounce period exceeded
|
|
//
|
|
|
|
firstVolumePath := "/home/coder"
|
|
secondVolumePath := "/dev/coder"
|
|
|
|
api, _, clock, notifyEnq := resourceMonitorAPI(t)
|
|
|
|
// Given:
|
|
// - First monitor in an OK state
|
|
// - Second monitor in an OK state
|
|
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
|
AgentID: api.AgentID,
|
|
Path: firstVolumePath,
|
|
State: database.WorkspaceAgentMonitorStateOK,
|
|
Threshold: 80,
|
|
})
|
|
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
|
|
AgentID: api.AgentID,
|
|
Path: secondVolumePath,
|
|
State: database.WorkspaceAgentMonitorStateNOK,
|
|
Threshold: 80,
|
|
})
|
|
|
|
// When:
|
|
// - First monitor is in a NOK state
|
|
// - Second monitor is in an OK state
|
|
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 10, Total: 10},
|
|
{Volume: secondVolumePath, Used: 1, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect a notification from only the first monitor
|
|
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 1)
|
|
volumes := requireVolumeData(t, sent[0])
|
|
require.Len(t, volumes, 1)
|
|
require.Equal(t, firstVolumePath, volumes[0]["path"])
|
|
notifyEnq.Clear()
|
|
|
|
// When:
|
|
// - First monitor moves back to OK
|
|
// - Second monitor moves to NOK
|
|
clock.Advance(api.Debounce / 4)
|
|
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 1, Total: 10},
|
|
{Volume: secondVolumePath, Used: 10, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect a notification from only the second monitor
|
|
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 1)
|
|
volumes = requireVolumeData(t, sent[0])
|
|
require.Len(t, volumes, 1)
|
|
require.Equal(t, secondVolumePath, volumes[0]["path"])
|
|
notifyEnq.Clear()
|
|
|
|
// When:
|
|
// - First monitor moves back to NOK before debounce period has ended
|
|
// - Second monitor moves back to OK
|
|
clock.Advance(api.Debounce / 4)
|
|
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 10, Total: 10},
|
|
{Volume: secondVolumePath, Used: 1, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect no new notifications
|
|
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 0)
|
|
notifyEnq.Clear()
|
|
|
|
// When:
|
|
// - First monitor moves back to OK
|
|
// - Second monitor moves back to NOK
|
|
clock.Advance(api.Debounce / 4)
|
|
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 1, Total: 10},
|
|
{Volume: secondVolumePath, Used: 10, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect no new notifications.
|
|
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 0)
|
|
notifyEnq.Clear()
|
|
|
|
// When:
|
|
// - First monitor moves back to a NOK state after the debounce period
|
|
// - Second monitor moves back to OK
|
|
clock.Advance(api.Debounce/4 + 1*time.Second)
|
|
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 10, Total: 10},
|
|
{Volume: secondVolumePath, Used: 1, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect a notification from only the first monitor
|
|
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 1)
|
|
volumes = requireVolumeData(t, sent[0])
|
|
require.Len(t, volumes, 1)
|
|
require.Equal(t, firstVolumePath, volumes[0]["path"])
|
|
notifyEnq.Clear()
|
|
|
|
// When:
|
|
// - First montior moves back to OK
|
|
// - Second monitor moves back to NOK after the debounce period
|
|
clock.Advance(api.Debounce/4 + 1*time.Second)
|
|
_, err = api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
|
|
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
|
|
{
|
|
CollectedAt: timestamppb.New(clock.Now()),
|
|
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
|
|
{Volume: firstVolumePath, Used: 1, Total: 10},
|
|
{Volume: secondVolumePath, Used: 10, Total: 10},
|
|
},
|
|
},
|
|
},
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Then:
|
|
// - We expect a notification from only the second monitor
|
|
sent = notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
|
|
require.Len(t, sent, 1)
|
|
volumes = requireVolumeData(t, sent[0])
|
|
require.Len(t, volumes, 1)
|
|
require.Equal(t, secondVolumePath, volumes[0]["path"])
|
|
}
|
|
|
|
// TestVolumeResourceMonitor is a table-driven test of volume monitor state
// transitions: given a prior state and a sequence of usage datapoints against
// the configured threshold, it checks the resulting notification behavior.
// Notifications are only expected on an OK -> NOK transition.
func TestVolumeResourceMonitor(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name             string
		volumePath       string
		volumeUsage      []int64 // usage datapoints, one per 15s interval
		volumeTotal      int64
		thresholdPercent int32
		previousState    database.WorkspaceAgentMonitorState
		expectState      database.WorkspaceAgentMonitorState
		shouldNotify     bool
	}{
		{
			name:             "WhenOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			// A single early spike is not enough to trip either alert rule.
			name:             "WhenOK/ShouldStayInOK",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			// A run of trailing NOK datapoints trips the consecutive rule.
			name:             "WhenOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			// Scattered NOK datapoints trip the minimum-percentage rule.
			name:             "WhenOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			name:             "WhenNOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ShouldStayInNOK",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
	}

	for _, tt := range tests {
		tt := tt // capture loop variable (pre-Go 1.22 semantics)

		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Build one datapoint per usage sample, spaced 15s apart.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
			collectedAt := clock.Now()
			for _, volumeUsage := range tt.volumeUsage {
				collectedAt = collectedAt.Add(15 * time.Second)

				volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
					{
						Volume: tt.volumePath,
						Used:   volumeUsage,
						Total:  tt.volumeTotal,
					},
				}

				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Volumes:     volumeDatapoints,
				})
			}

			dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
				AgentID:   api.AgentID,
				Path:      tt.volumePath,
				State:     tt.previousState,
				Threshold: tt.thresholdPercent,
			})

			// Align the mock clock with the last datapoint's timestamp.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}
		})
	}
}
|
|
|
|
// TestVolumeResourceMonitorMultiple verifies that when several volume
// monitors on the same agent alert at once, a single notification is sent
// that carries the data for all alerting volumes.
func TestVolumeResourceMonitorMultiple(t *testing.T) {
	t.Parallel()

	api, _, clock, notifyEnq := resourceMonitorAPI(t)
	// A single NOK datapoint is enough to flip each monitor's state.
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// Given: two different volume resource monitors
	dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
		AgentID:   api.AgentID,
		Path:      "/home/coder",
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
		AgentID:   api.AgentID,
		Path:      "/dev/coder",
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	// When: both of them move to a NOK state
	_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
					{
						Volume: "/home/coder",
						Used:   10,
						Total:  10,
					},
					{
						Volume: "/dev/coder",
						Used:   10,
						Total:  10,
					},
				},
			},
		},
	})
	require.NoError(t, err)

	// Then: We expect a notification to alert with information about both
	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
	require.Len(t, sent, 1)

	// The notification payload lists both volumes, in push order.
	volumes := requireVolumeData(t, sent[0])
	require.Len(t, volumes, 2)
	require.Equal(t, "/home/coder", volumes[0]["path"])
	require.Equal(t, "/dev/coder", volumes[1]["path"])
}
|
|
|
|
// TestVolumeResourceMonitorMissingData verifies that a datapoint with no
// usage entry for a monitored volume ("unknown") prevents the monitor from
// changing state in either direction: no alert is raised from OK, and no
// recovery occurs from NOK.
func TestVolumeResourceMonitorMissingData(t *testing.T) {
	t.Parallel()

	t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
		t.Parallel()

		volumePath := "/home/coder"

		api, _, clock, notifyEnq := resourceMonitorAPI(t)
		// With MinimumNOKsPercent at 100, every datapoint must be NOK to
		// alert; the empty datapoint below breaks that requirement.
		api.Config.Alert.ConsecutiveNOKsPercent = 50
		api.Config.Alert.MinimumNOKsPercent = 100

		// Given: A monitor in an OK state.
		dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
			AgentID:   api.AgentID,
			Path:      volumePath,
			State:     database.WorkspaceAgentMonitorStateOK,
			Threshold: 80,
		})

		// When: A datapoint is missing, surrounded by two NOK datapoints.
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{
							Volume: volumePath,
							Used:   10,
							Total:  10,
						},
					},
				},
				{
					// The "unknown" datapoint: no usage reported for the volume.
					CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
					Volumes:     []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{
							Volume: volumePath,
							Used:   10,
							Total:  10,
						},
					},
				},
			},
		})
		require.NoError(t, err)

		// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
		require.Len(t, sent, 0)

		// Then: We expect the monitor to still be in an OK state.
		monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
		require.NoError(t, err)
		require.Len(t, monitors, 1)
		require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
	})

	t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
		t.Parallel()

		volumePath := "/home/coder"

		api, _, clock, _ := resourceMonitorAPI(t)
		api.Config.Alert.ConsecutiveNOKsPercent = 50
		api.Config.Alert.MinimumNOKsPercent = 100

		// Given: A monitor in a NOK state.
		dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
			AgentID:   api.AgentID,
			Path:      volumePath,
			State:     database.WorkspaceAgentMonitorStateNOK,
			Threshold: 80,
		})

		// When: A datapoint is missing, surrounded by two OK datapoints.
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{
							Volume: volumePath,
							Used:   1,
							Total:  10,
						},
					},
				},
				{
					// The "unknown" datapoint: no usage reported for the volume.
					CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
					Volumes:     []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
				},
				{
					CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{
							Volume: volumePath,
							Used:   1,
							Total:  10,
						},
					},
				},
			},
		})
		require.NoError(t, err)

		// Then: We expect the monitor to still be in a NOK state.
		monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
		require.NoError(t, err)
		require.Len(t, monitors, 1)
		require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
	})
}
|
|
|
|
func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
|
|
t.Helper()
|
|
|
|
volumesData := notif.Data["volumes"]
|
|
require.IsType(t, []map[string]any{}, volumesData)
|
|
|
|
return volumesData.([]map[string]any)
|
|
}
|