Files
coder/coderd/notifications/metrics_test.go
Hugo Dutka 6ed76921dd chore: fix windows postgres tests (#15593)
Patches tests that caused Windows Postgres CI in
https://github.com/coder/coder/pull/15520 to consistently fail.

I tested this by temporarily adding Postgres Windows CI to this PR.
However, I reverted those changes to merge them with
https://github.com/coder/coder/pull/15520. For reference, here's [a
passing CI
run](https://github.com/coder/coder/actions/runs/11918816662/job/33219786238)
from an earlier commit.

**Note:** Although Windows tests now pass, they remain quite flaky. I
recommend running Postgres Windows CI to gather data on these flakes,
but I don’t think it should be a required job just yet.
2024-11-20 13:30:31 +01:00

550 lines
20 KiB
Go

package notifications_test
import (
"context"
"runtime"
"strconv"
"sync"
"testing"
"text/template"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
promtest "github.com/prometheus/client_golang/prometheus/testutil"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/coder/quartz"
"github.com/coder/serpent"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/notifications/dispatch"
"github.com/coder/coder/v2/coderd/notifications/types"
"github.com/coder/coder/v2/testutil"
)
func TestMetrics(t *testing.T) {
t.Parallel()
// SETUP
if !dbtestutil.WillUsePostgres() {
t.Skip("This test requires postgres; it relies on business-logic only implemented in the database")
}
// nolint:gocritic // Unit test.
ctx := dbauthz.AsSystemRestricted(testutil.Context(t, testutil.WaitSuperLong))
store, _ := dbtestutil.NewDB(t)
logger := testutil.Logger(t)
reg := prometheus.NewRegistry()
metrics := notifications.NewMetrics(reg)
tmpl := notifications.TemplateWorkspaceDeleted
const (
method = database.NotificationMethodSmtp
maxAttempts = 3
debug = false
)
// GIVEN: a notification manager whose intervals are tuned low (for test speed) and whose dispatches are intercepted
cfg := defaultNotificationsConfig(method)
cfg.MaxSendAttempts = maxAttempts
// Tune the intervals low to increase test speed.
cfg.FetchInterval = serpent.Duration(time.Millisecond * 50)
cfg.RetryInterval = serpent.Duration(time.Millisecond * 50)
cfg.StoreSyncInterval = serpent.Duration(time.Millisecond * 100) // Twice as long as fetch interval to ensure we catch pending updates.
mgr, err := notifications.NewManager(cfg, store, defaultHelpers(), metrics, logger.Named("manager"))
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, mgr.Stop(ctx))
})
handler := &fakeHandler{}
mgr.WithHandlers(map[database.NotificationMethod]notifications.Handler{
method: handler,
})
enq, err := notifications.NewStoreEnqueuer(cfg, store, defaultHelpers(), logger.Named("enqueuer"), quartz.NewReal())
require.NoError(t, err)
user := createSampleUser(t, store)
// Build fingerprints for the two different series we expect.
methodTemplateFP := fingerprintLabels(notifications.LabelMethod, string(method), notifications.LabelTemplateID, tmpl.String())
methodFP := fingerprintLabels(notifications.LabelMethod, string(method))
expected := map[string]func(metric *dto.Metric, series string) bool{
"coderd_notifications_dispatch_attempts_total": func(metric *dto.Metric, series string) bool {
// This metric has 3 possible dispositions; find if any of them match first before we check the metric's value.
results := map[string]float64{
notifications.ResultSuccess: 1, // Only 1 successful delivery.
notifications.ResultTempFail: maxAttempts - 1, // 2 temp failures, on the 3rd it'll be marked permanent failure.
notifications.ResultPermFail: 1, // 1 permanent failure after retries exhausted.
}
var match string
for result, val := range results {
seriesFP := fingerprintLabels(notifications.LabelMethod, string(method), notifications.LabelTemplateID, tmpl.String(), notifications.LabelResult, result)
if !hasMatchingFingerprint(metric, seriesFP) {
continue
}
match = result
if debug {
t.Logf("coderd_notifications_dispatch_attempts_total{result=%q} == %v: %v", result, val, metric.Counter.GetValue())
}
break
}
// Could not find a matching series.
if match == "" {
assert.Failf(t, "found unexpected series %q", series)
return false
}
// nolint:forcetypeassert // Already checked above.
target := results[match]
return metric.Counter.GetValue() == target
},
"coderd_notifications_retry_count": func(metric *dto.Metric, series string) bool {
assert.Truef(t, hasMatchingFingerprint(metric, methodTemplateFP), "found unexpected series %q", series)
if debug {
t.Logf("coderd_notifications_retry_count == %v: %v", maxAttempts-1, metric.Counter.GetValue())
}
// 1 original attempts + 2 retries = maxAttempts
return metric.Counter.GetValue() == maxAttempts-1
},
"coderd_notifications_queued_seconds": func(metric *dto.Metric, series string) bool {
assert.Truef(t, hasMatchingFingerprint(metric, methodFP), "found unexpected series %q", series)
if debug {
t.Logf("coderd_notifications_queued_seconds > 0: %v", metric.Histogram.GetSampleSum())
}
// This check is extremely flaky on windows. It fails more often than not, but not always.
if runtime.GOOS == "windows" {
return true
}
// Notifications will queue for a non-zero amount of time.
return metric.Histogram.GetSampleSum() > 0
},
"coderd_notifications_dispatcher_send_seconds": func(metric *dto.Metric, series string) bool {
assert.Truef(t, hasMatchingFingerprint(metric, methodFP), "found unexpected series %q", series)
if debug {
t.Logf("coderd_notifications_dispatcher_send_seconds > 0: %v", metric.Histogram.GetSampleSum())
}
// This check is extremely flaky on windows. It fails more often than not, but not always.
if runtime.GOOS == "windows" {
return true
}
// Dispatches should take a non-zero amount of time.
return metric.Histogram.GetSampleSum() > 0
},
"coderd_notifications_inflight_dispatches": func(metric *dto.Metric, series string) bool {
// This is a gauge, so it can be difficult to get the timing right to catch it.
// See TestInflightDispatchesMetric for a more precise test.
return true
},
"coderd_notifications_pending_updates": func(metric *dto.Metric, series string) bool {
// This is a gauge, so it can be difficult to get the timing right to catch it.
// See TestPendingUpdatesMetric for a more precise test.
return true
},
"coderd_notifications_synced_updates_total": func(metric *dto.Metric, series string) bool {
if debug {
t.Logf("coderd_notifications_synced_updates_total = %v: %v", maxAttempts+1, metric.Counter.GetValue())
}
// 1 message will exceed its maxAttempts, 1 will succeed on the first try.
return metric.Counter.GetValue() == maxAttempts+1
},
}
// WHEN: 2 notifications are enqueued, 1 of which will fail until its retries are exhausted, and another which will succeed
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "success"}, "test") // this will succeed
require.NoError(t, err)
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "failure"}, "test2") // this will fail and retry (maxAttempts - 1) times
require.NoError(t, err)
mgr.Run(ctx)
// THEN: expect all the defined metrics to be present and have their expected values
require.EventuallyWithT(t, func(ct *assert.CollectT) {
handler.mu.RLock()
defer handler.mu.RUnlock()
gathered, err := reg.Gather()
assert.NoError(t, err)
succeeded := len(handler.succeeded)
failed := len(handler.failed)
if debug {
t.Logf("SUCCEEDED == 1: %v, FAILED == %v: %v\n", succeeded, maxAttempts, failed)
}
// Ensure that all metrics have a) the expected label combinations (series) and b) the expected values.
for _, family := range gathered {
hasExpectedValue, ok := expected[family.GetName()]
if !assert.Truef(ct, ok, "found unexpected metric family %q", family.GetName()) {
t.Logf("found unexpected metric family %q", family.GetName())
// Bail out fast if precondition is not met.
ct.FailNow()
}
for _, metric := range family.Metric {
assert.True(ct, hasExpectedValue(metric, metric.String()))
}
}
// One message will succeed.
assert.Equal(ct, succeeded, 1)
// One message will fail, and exhaust its maxAttempts.
assert.Equal(ct, failed, maxAttempts)
}, testutil.WaitShort, testutil.IntervalFast)
}
func TestPendingUpdatesMetric(t *testing.T) {
t.Parallel()
// SETUP
// nolint:gocritic // Unit test.
ctx := dbauthz.AsSystemRestricted(testutil.Context(t, testutil.WaitSuperLong))
store, _ := dbtestutil.NewDB(t)
logger := testutil.Logger(t)
reg := prometheus.NewRegistry()
metrics := notifications.NewMetrics(reg)
tmpl := notifications.TemplateWorkspaceDeleted
const method = database.NotificationMethodSmtp
// GIVEN: a notification manager whose store updates are intercepted so we can read the number of pending updates set in the metric
cfg := defaultNotificationsConfig(method)
cfg.RetryInterval = serpent.Duration(time.Hour) // Delay retries so they don't interfere.
cfg.FetchInterval = serpent.Duration(time.Millisecond * 50)
cfg.StoreSyncInterval = serpent.Duration(time.Millisecond * 100)
syncer := &syncInterceptor{Store: store}
interceptor := newUpdateSignallingInterceptor(syncer)
mClock := quartz.NewMock(t)
trap := mClock.Trap().NewTicker("Manager", "storeSync")
defer trap.Close()
fetchTrap := mClock.Trap().TickerFunc("notifier", "fetchInterval")
defer fetchTrap.Close()
mgr, err := notifications.NewManager(cfg, interceptor, defaultHelpers(), metrics, logger.Named("manager"),
notifications.WithTestClock(mClock))
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, mgr.Stop(ctx))
})
handler := &fakeHandler{}
mgr.WithHandlers(map[database.NotificationMethod]notifications.Handler{
method: handler,
})
enq, err := notifications.NewStoreEnqueuer(cfg, store, defaultHelpers(), logger.Named("enqueuer"), quartz.NewReal())
require.NoError(t, err)
user := createSampleUser(t, store)
// WHEN: 2 notifications are enqueued, one of which will fail and one which will succeed
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "success"}, "test") // this will succeed
require.NoError(t, err)
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "failure"}, "test2") // this will fail and retry (maxAttempts - 1) times
require.NoError(t, err)
mgr.Run(ctx)
trap.MustWait(ctx).Release() // ensures ticker has been set
fetchTrap.MustWait(ctx).Release()
// Advance to the first fetch
mClock.Advance(cfg.FetchInterval.Value()).MustWait(ctx)
// THEN:
// handler has dispatched the given notifications.
func() {
handler.mu.RLock()
defer handler.mu.RUnlock()
require.Len(t, handler.succeeded, 1)
require.Len(t, handler.failed, 1)
}()
// Both handler calls should be pending in the metrics.
require.EqualValues(t, 2, promtest.ToFloat64(metrics.PendingUpdates))
// THEN:
// Trigger syncing updates
mClock.Advance(cfg.StoreSyncInterval.Value() - cfg.FetchInterval.Value()).MustWait(ctx)
// Wait until we intercept the calls to sync the pending updates to the store.
success := testutil.RequireRecvCtx(testutil.Context(t, testutil.WaitShort), t, interceptor.updateSuccess)
require.EqualValues(t, 1, success)
failure := testutil.RequireRecvCtx(testutil.Context(t, testutil.WaitShort), t, interceptor.updateFailure)
require.EqualValues(t, 1, failure)
// Validate that the store synced the expected number of updates.
require.Eventually(t, func() bool {
return syncer.sent.Load() == 1 && syncer.failed.Load() == 1
}, testutil.WaitShort, testutil.IntervalFast)
// Wait for the updates to be synced and the metric to reflect that.
require.Eventually(t, func() bool {
return promtest.ToFloat64(metrics.PendingUpdates) == 0
}, testutil.WaitShort, testutil.IntervalFast)
}
func TestInflightDispatchesMetric(t *testing.T) {
t.Parallel()
// SETUP
// nolint:gocritic // Unit test.
ctx := dbauthz.AsSystemRestricted(testutil.Context(t, testutil.WaitSuperLong))
store, _ := dbtestutil.NewDB(t)
logger := testutil.Logger(t)
reg := prometheus.NewRegistry()
metrics := notifications.NewMetrics(reg)
tmpl := notifications.TemplateWorkspaceDeleted
const method = database.NotificationMethodSmtp
// GIVEN: a notification manager whose dispatches are intercepted and delayed to measure the number of inflight requests
cfg := defaultNotificationsConfig(method)
cfg.LeaseCount = 10
cfg.FetchInterval = serpent.Duration(time.Millisecond * 50)
cfg.RetryInterval = serpent.Duration(time.Hour) // Delay retries so they don't interfere.
cfg.StoreSyncInterval = serpent.Duration(time.Millisecond * 100)
mgr, err := notifications.NewManager(cfg, store, defaultHelpers(), metrics, logger.Named("manager"))
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, mgr.Stop(ctx))
})
handler := &fakeHandler{}
const msgCount = 2
// Barrier handler will wait until all notification messages are in-flight.
barrier := newBarrierHandler(msgCount, handler)
mgr.WithHandlers(map[database.NotificationMethod]notifications.Handler{
method: barrier,
})
enq, err := notifications.NewStoreEnqueuer(cfg, store, defaultHelpers(), logger.Named("enqueuer"), quartz.NewReal())
require.NoError(t, err)
user := createSampleUser(t, store)
// WHEN: notifications are enqueued which will succeed (and be delayed during dispatch)
for i := 0; i < msgCount; i++ {
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "success", "i": strconv.Itoa(i)}, "test")
require.NoError(t, err)
}
mgr.Run(ctx)
// THEN:
// Ensure we see the dispatches of the messages inflight.
require.Eventually(t, func() bool {
return promtest.ToFloat64(metrics.InflightDispatches.WithLabelValues(string(method), tmpl.String())) == msgCount
}, testutil.WaitShort, testutil.IntervalFast)
for i := 0; i < msgCount; i++ {
barrier.wg.Done()
}
// Wait until the handler has dispatched the given notifications.
require.Eventually(t, func() bool {
handler.mu.RLock()
defer handler.mu.RUnlock()
return len(handler.succeeded) == msgCount
}, testutil.WaitShort, testutil.IntervalFast)
// Wait for the updates to be synced and the metric to reflect that.
require.Eventually(t, func() bool {
return promtest.ToFloat64(metrics.InflightDispatches) == 0
}, testutil.WaitShort, testutil.IntervalFast)
}
func TestCustomMethodMetricCollection(t *testing.T) {
t.Parallel()
// SETUP
if !dbtestutil.WillUsePostgres() {
// UpdateNotificationTemplateMethodByID only makes sense with a real database.
t.Skip("This test requires postgres; it relies on business-logic only implemented in the database")
}
// nolint:gocritic // Unit test.
ctx := dbauthz.AsSystemRestricted(testutil.Context(t, testutil.WaitSuperLong))
store, _ := dbtestutil.NewDB(t)
logger := testutil.Logger(t)
var (
reg = prometheus.NewRegistry()
metrics = notifications.NewMetrics(reg)
tmpl = notifications.TemplateWorkspaceDeleted
anotherTemplate = notifications.TemplateWorkspaceDormant
)
const (
customMethod = database.NotificationMethodWebhook
defaultMethod = database.NotificationMethodSmtp
)
// GIVEN: a template whose notification method differs from the default.
out, err := store.UpdateNotificationTemplateMethodByID(ctx, database.UpdateNotificationTemplateMethodByIDParams{
ID: tmpl,
Method: database.NullNotificationMethod{NotificationMethod: customMethod, Valid: true},
})
require.NoError(t, err)
require.Equal(t, customMethod, out.Method.NotificationMethod)
// WHEN: two notifications (each with different templates) are enqueued.
cfg := defaultNotificationsConfig(defaultMethod)
mgr, err := notifications.NewManager(cfg, store, defaultHelpers(), metrics, logger.Named("manager"))
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, mgr.Stop(ctx))
})
smtpHandler := &fakeHandler{}
webhookHandler := &fakeHandler{}
mgr.WithHandlers(map[database.NotificationMethod]notifications.Handler{
defaultMethod: smtpHandler,
customMethod: webhookHandler,
})
enq, err := notifications.NewStoreEnqueuer(cfg, store, defaultHelpers(), logger.Named("enqueuer"), quartz.NewReal())
require.NoError(t, err)
user := createSampleUser(t, store)
_, err = enq.Enqueue(ctx, user.ID, tmpl, map[string]string{"type": "success"}, "test")
require.NoError(t, err)
_, err = enq.Enqueue(ctx, user.ID, anotherTemplate, map[string]string{"type": "success"}, "test")
require.NoError(t, err)
mgr.Run(ctx)
// THEN: the fake handlers to "dispatch" the notifications.
require.Eventually(t, func() bool {
smtpHandler.mu.RLock()
webhookHandler.mu.RLock()
defer smtpHandler.mu.RUnlock()
defer webhookHandler.mu.RUnlock()
return len(smtpHandler.succeeded) == 1 && len(smtpHandler.failed) == 0 &&
len(webhookHandler.succeeded) == 1 && len(webhookHandler.failed) == 0
}, testutil.WaitShort, testutil.IntervalFast)
// THEN: we should have metric series for both the default and custom notification methods.
require.Eventually(t, func() bool {
return promtest.ToFloat64(metrics.DispatchAttempts.WithLabelValues(string(defaultMethod), anotherTemplate.String(), notifications.ResultSuccess)) > 0 &&
promtest.ToFloat64(metrics.DispatchAttempts.WithLabelValues(string(customMethod), tmpl.String(), notifications.ResultSuccess)) > 0
}, testutil.WaitShort, testutil.IntervalFast)
}
// hasMatchingFingerprint checks if the given metric's series fingerprint matches the reference fingerprint.
func hasMatchingFingerprint(metric *dto.Metric, fp model.Fingerprint) bool {
return fingerprintLabelPairs(metric.Label) == fp
}
// fingerprintLabelPairs produces a fingerprint unique to the given combination of label pairs.
func fingerprintLabelPairs(lbs []*dto.LabelPair) model.Fingerprint {
pairs := make([]string, 0, len(lbs)*2)
for _, lp := range lbs {
pairs = append(pairs, lp.GetName(), lp.GetValue())
}
return fingerprintLabels(pairs...)
}
// fingerprintLabels produces a fingerprint unique to the given pairs of label values.
// MUST contain an even number of arguments (key:value), otherwise it will panic.
func fingerprintLabels(lbs ...string) model.Fingerprint {
if len(lbs)%2 != 0 {
panic("imbalanced set of label pairs given")
}
lbsSet := make(model.LabelSet, len(lbs)/2)
for i := 0; i < len(lbs); i += 2 {
k := lbs[i]
v := lbs[i+1]
lbsSet[model.LabelName(k)] = model.LabelValue(v)
}
return lbsSet.Fingerprint() // FastFingerprint does not sort the labels.
}
// updateSignallingInterceptor intercepts bulk update calls to the store, and waits on the "proceed" condition to be
// signaled by the caller so it can continue.
type updateSignallingInterceptor struct {
notifications.Store
updateSuccess chan int
updateFailure chan int
}
func newUpdateSignallingInterceptor(interceptor notifications.Store) *updateSignallingInterceptor {
return &updateSignallingInterceptor{
Store: interceptor,
updateSuccess: make(chan int, 1),
updateFailure: make(chan int, 1),
}
}
func (u *updateSignallingInterceptor) BulkMarkNotificationMessagesSent(ctx context.Context, arg database.BulkMarkNotificationMessagesSentParams) (int64, error) {
u.updateSuccess <- len(arg.IDs)
return u.Store.BulkMarkNotificationMessagesSent(ctx, arg)
}
func (u *updateSignallingInterceptor) BulkMarkNotificationMessagesFailed(ctx context.Context, arg database.BulkMarkNotificationMessagesFailedParams) (int64, error) {
u.updateFailure <- len(arg.IDs)
return u.Store.BulkMarkNotificationMessagesFailed(ctx, arg)
}
type barrierHandler struct {
h notifications.Handler
wg *sync.WaitGroup
}
func newBarrierHandler(total int, handler notifications.Handler) *barrierHandler {
var wg sync.WaitGroup
wg.Add(total)
return &barrierHandler{
h: handler,
wg: &wg,
}
}
func (bh *barrierHandler) Dispatcher(payload types.MessagePayload, title, body string, helpers template.FuncMap) (dispatch.DeliveryFunc, error) {
deliverFn, err := bh.h.Dispatcher(payload, title, body, helpers)
if err != nil {
return nil, err
}
return func(ctx context.Context, msgID uuid.UUID) (retryable bool, err error) {
bh.wg.Wait()
return deliverFn(ctx, msgID)
}, nil
}