feat(coderd/notifications): group workspace build failure report (#17306)

Closes https://github.com/coder/coder/issues/15745

Instead of sending a template admin one report per template, we now
send a single report that groups all of their templates.
This commit is contained in:
Danielle Maywood
2025-04-10 13:32:19 +01:00
committed by GitHub
parent 33b9487899
commit 6dd1056025
10 changed files with 552 additions and 246 deletions

View File

@ -18,6 +18,7 @@ import (
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/util/slice"
"github.com/coder/coder/v2/codersdk"
)
@ -102,6 +103,11 @@ const (
failedWorkspaceBuildsReportFrequencyLabel = "week"
)
// adminReport pairs the build statistics of a single template with the failed
// workspace builds fetched for that template. reportFailedWorkspaceBuilds
// collects one adminReport per template for every template admin, so that all
// of an admin's templates can be delivered in a single notification.
type adminReport struct {
// stats is the per-template statistics row; its template name, display name,
// and failed/total build counts are copied into the notification data.
stats database.GetWorkspaceBuildStatsByTemplatesRow
// failedBuilds are the failed workspace builds of the same template, which
// the report groups by template version.
failedBuilds []database.GetFailedWorkspaceBuildsByTemplateIDRow
}
func reportFailedWorkspaceBuilds(ctx context.Context, logger slog.Logger, db database.Store, enqueuer notifications.Enqueuer, clk quartz.Clock) error {
now := clk.Now()
since := now.Add(-failedWorkspaceBuildsReportFrequency)
@ -136,6 +142,8 @@ func reportFailedWorkspaceBuilds(ctx context.Context, logger slog.Logger, db dat
return xerrors.Errorf("unable to fetch failed workspace builds: %w", err)
}
reports := make(map[uuid.UUID][]adminReport)
for _, stats := range templateStatsRows {
select {
case <-ctx.Done():
@ -165,33 +173,40 @@ func reportFailedWorkspaceBuilds(ctx context.Context, logger slog.Logger, db dat
logger.Error(ctx, "unable to fetch failed workspace builds", slog.F("template_id", stats.TemplateID), slog.Error(err))
continue
}
reportData := buildDataForReportFailedWorkspaceBuilds(stats, failedBuilds)
// Send reports to template admins
templateDisplayName := stats.TemplateDisplayName
if templateDisplayName == "" {
templateDisplayName = stats.TemplateName
}
for _, templateAdmin := range templateAdmins {
select {
case <-ctx.Done():
logger.Debug(ctx, "context is canceled, quitting", slog.Error(ctx.Err()))
break
default:
}
adminReports := reports[templateAdmin.ID]
adminReports = append(adminReports, adminReport{
failedBuilds: failedBuilds,
stats: stats,
})
if _, err := enqueuer.EnqueueWithData(ctx, templateAdmin.ID, notifications.TemplateWorkspaceBuildsFailedReport,
map[string]string{
"template_name": stats.TemplateName,
"template_display_name": templateDisplayName,
},
reportData,
"report_generator",
stats.TemplateID, stats.TemplateOrganizationID,
); err != nil {
logger.Warn(ctx, "failed to send a report with failed workspace builds", slog.Error(err))
}
reports[templateAdmin.ID] = adminReports
}
}
for templateAdmin, reports := range reports {
select {
case <-ctx.Done():
logger.Debug(ctx, "context is canceled, quitting", slog.Error(ctx.Err()))
break
default:
}
reportData := buildDataForReportFailedWorkspaceBuilds(reports)
targets := []uuid.UUID{}
for _, report := range reports {
targets = append(targets, report.stats.TemplateID, report.stats.TemplateOrganizationID)
}
if _, err := enqueuer.EnqueueWithData(ctx, templateAdmin, notifications.TemplateWorkspaceBuildsFailedReport,
map[string]string{},
reportData,
"report_generator",
slice.Unique(targets)...,
); err != nil {
logger.Warn(ctx, "failed to send a report with failed workspace builds", slog.Error(err))
}
}
@ -213,54 +228,71 @@ func reportFailedWorkspaceBuilds(ctx context.Context, logger slog.Logger, db dat
const workspaceBuildsLimitPerTemplateVersion = 10
func buildDataForReportFailedWorkspaceBuilds(stats database.GetWorkspaceBuildStatsByTemplatesRow, failedBuilds []database.GetFailedWorkspaceBuildsByTemplateIDRow) map[string]any {
// Build notification model for template versions and failed workspace builds.
//
// Failed builds are sorted by template version ascending, workspace build number descending.
// Review builds, group them by template versions, and assign to builds to template versions.
// The map requires `[]map[string]any{}` to be compatible with data passed to `NotificationEnqueuer`.
templateVersions := []map[string]any{}
for _, failedBuild := range failedBuilds {
c := len(templateVersions)
func buildDataForReportFailedWorkspaceBuilds(reports []adminReport) map[string]any {
templates := []map[string]any{}
if c == 0 || templateVersions[c-1]["template_version_name"] != failedBuild.TemplateVersionName {
templateVersions = append(templateVersions, map[string]any{
"template_version_name": failedBuild.TemplateVersionName,
"failed_count": 1,
"failed_builds": []map[string]any{
{
"workspace_owner_username": failedBuild.WorkspaceOwnerUsername,
"workspace_name": failedBuild.WorkspaceName,
"build_number": failedBuild.WorkspaceBuildNumber,
for _, report := range reports {
// Build notification model for template versions and failed workspace builds.
//
// Failed builds are sorted by template version ascending, workspace build number descending.
// Review builds, group them by template versions, and assign builds to template versions.
// The map requires `[]map[string]any{}` to be compatible with data passed to `NotificationEnqueuer`.
templateVersions := []map[string]any{}
for _, failedBuild := range report.failedBuilds {
c := len(templateVersions)
if c == 0 || templateVersions[c-1]["template_version_name"] != failedBuild.TemplateVersionName {
templateVersions = append(templateVersions, map[string]any{
"template_version_name": failedBuild.TemplateVersionName,
"failed_count": 1,
"failed_builds": []map[string]any{
{
"workspace_owner_username": failedBuild.WorkspaceOwnerUsername,
"workspace_name": failedBuild.WorkspaceName,
"workspace_id": failedBuild.WorkspaceID,
"build_number": failedBuild.WorkspaceBuildNumber,
},
},
},
})
continue
})
continue
}
tv := templateVersions[c-1]
//nolint:errorlint,forcetypeassert // only this function prepares the notification model
tv["failed_count"] = tv["failed_count"].(int) + 1
//nolint:errorlint,forcetypeassert // only this function prepares the notification model
builds := tv["failed_builds"].([]map[string]any)
if len(builds) < workspaceBuildsLimitPerTemplateVersion {
// return N last builds to prevent long email reports
builds = append(builds, map[string]any{
"workspace_owner_username": failedBuild.WorkspaceOwnerUsername,
"workspace_name": failedBuild.WorkspaceName,
"workspace_id": failedBuild.WorkspaceID,
"build_number": failedBuild.WorkspaceBuildNumber,
})
tv["failed_builds"] = builds
}
templateVersions[c-1] = tv
}
tv := templateVersions[c-1]
//nolint:errorlint,forcetypeassert // only this function prepares the notification model
tv["failed_count"] = tv["failed_count"].(int) + 1
//nolint:errorlint,forcetypeassert // only this function prepares the notification model
builds := tv["failed_builds"].([]map[string]any)
if len(builds) < workspaceBuildsLimitPerTemplateVersion {
// return N last builds to prevent long email reports
builds = append(builds, map[string]any{
"workspace_owner_username": failedBuild.WorkspaceOwnerUsername,
"workspace_name": failedBuild.WorkspaceName,
"build_number": failedBuild.WorkspaceBuildNumber,
})
tv["failed_builds"] = builds
templateDisplayName := report.stats.TemplateDisplayName
if templateDisplayName == "" {
templateDisplayName = report.stats.TemplateName
}
templateVersions[c-1] = tv
templates = append(templates, map[string]any{
"failed_builds": report.stats.FailedBuilds,
"total_builds": report.stats.TotalBuilds,
"versions": templateVersions,
"name": report.stats.TemplateName,
"display_name": templateDisplayName,
})
}
return map[string]any{
"failed_builds": stats.FailedBuilds,
"total_builds": stats.TotalBuilds,
"report_frequency": failedWorkspaceBuildsReportFrequencyLabel,
"template_versions": templateVersions,
"report_frequency": failedWorkspaceBuildsReportFrequencyLabel,
"templates": templates,
}
}

View File

@ -3,6 +3,7 @@ package reports
import (
"context"
"database/sql"
"sort"
"testing"
"time"
@ -118,17 +119,13 @@ func TestReportFailedWorkspaceBuilds(t *testing.T) {
t.Run("FailedBuilds_SecondRun_Report_ThirdRunTooEarly_NoReport_FourthRun_Report", func(t *testing.T) {
t.Parallel()
verifyNotification := func(t *testing.T, recipient database.User, notif *notificationstest.FakeNotification, tmpl database.Template, failedBuilds, totalBuilds int64, templateVersions []map[string]interface{}) {
verifyNotification := func(t *testing.T, recipientID uuid.UUID, notif *notificationstest.FakeNotification, templates []map[string]any) {
t.Helper()
require.Equal(t, recipient.ID, notif.UserID)
require.Equal(t, recipientID, notif.UserID)
require.Equal(t, notifications.TemplateWorkspaceBuildsFailedReport, notif.TemplateID)
require.Equal(t, tmpl.Name, notif.Labels["template_name"])
require.Equal(t, tmpl.DisplayName, notif.Labels["template_display_name"])
require.Equal(t, failedBuilds, notif.Data["failed_builds"])
require.Equal(t, totalBuilds, notif.Data["total_builds"])
require.Equal(t, "week", notif.Data["report_frequency"])
require.Equal(t, templateVersions, notif.Data["template_versions"])
require.Equal(t, templates, notif.Data["templates"])
}
// Setup
@ -212,43 +209,65 @@ func TestReportFailedWorkspaceBuilds(t *testing.T) {
require.NoError(t, err)
sent := notifEnq.Sent()
require.Len(t, sent, 4) // 2 templates, 2 template admins
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, sent[i], t1, 3, 4, []map[string]interface{}{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(7), "workspace_name": w3.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(1), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 2,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(3), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
})
}
require.Len(t, sent, 2) // 2 templates, 2 template admins
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, sent[i+2], t2, 3, 5, []map[string]interface{}{
templateAdmins := []uuid.UUID{templateAdmin1.ID, templateAdmin2.ID}
// Ensure consistent order for tests
sort.Slice(templateAdmins, func(i, j int) bool {
return templateAdmins[i].String() < templateAdmins[j].String()
})
sort.Slice(sent, func(i, j int) bool {
return sent[i].UserID.String() < sent[j].UserID.String()
})
for i, templateAdmin := range templateAdmins {
verifyNotification(t, templateAdmin, sent[i], []map[string]any{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(8), "workspace_name": w4.Name, "workspace_owner_username": user2.Username},
"name": t1.Name,
"display_name": t1.DisplayName,
"failed_builds": int64(3),
"total_builds": int64(4),
"versions": []map[string]any{
{
"failed_builds": []map[string]any{
{"build_number": int32(7), "workspace_name": w3.Name, "workspace_id": w3.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(1), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
},
"failed_count": 2,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]any{
{"build_number": int32(3), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
},
"failed_count": 1,
"template_version_name": t2v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(6), "workspace_name": w2.Name, "workspace_owner_username": user2.Username},
{"build_number": int32(5), "workspace_name": w2.Name, "workspace_owner_username": user2.Username},
"name": t2.Name,
"display_name": t2.DisplayName,
"failed_builds": int64(3),
"total_builds": int64(5),
"versions": []map[string]any{
{
"failed_builds": []map[string]any{
{"build_number": int32(8), "workspace_name": w4.Name, "workspace_id": w4.ID, "workspace_owner_username": user2.Username},
},
"failed_count": 1,
"template_version_name": t2v1.Name,
},
{
"failed_builds": []map[string]any{
{"build_number": int32(6), "workspace_name": w2.Name, "workspace_id": w2.ID, "workspace_owner_username": user2.Username},
{"build_number": int32(5), "workspace_name": w2.Name, "workspace_id": w2.ID, "workspace_owner_username": user2.Username},
},
"failed_count": 2,
"template_version_name": t2v2.Name,
},
},
"failed_count": 2,
"template_version_name": t2v2.Name,
},
})
}
@ -279,14 +298,33 @@ func TestReportFailedWorkspaceBuilds(t *testing.T) {
// Then: we should see the failed job in the report
sent = notifEnq.Sent()
require.Len(t, sent, 2) // a new failed job should be reported
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, sent[i], t1, 1, 1, []map[string]interface{}{
templateAdmins = []uuid.UUID{templateAdmin1.ID, templateAdmin2.ID}
// Ensure consistent order for tests
sort.Slice(templateAdmins, func(i, j int) bool {
return templateAdmins[i].String() < templateAdmins[j].String()
})
sort.Slice(sent, func(i, j int) bool {
return sent[i].UserID.String() < sent[j].UserID.String()
})
for i, templateAdmin := range templateAdmins {
verifyNotification(t, templateAdmin, sent[i], []map[string]any{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(77), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
"name": t1.Name,
"display_name": t1.DisplayName,
"failed_builds": int64(1),
"total_builds": int64(1),
"versions": []map[string]any{
{
"failed_builds": []map[string]any{
{"build_number": int32(77), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
})
}
@ -295,17 +333,13 @@ func TestReportFailedWorkspaceBuilds(t *testing.T) {
t.Run("TooManyFailedBuilds_SecondRun_Report", func(t *testing.T) {
t.Parallel()
verifyNotification := func(t *testing.T, recipient database.User, notif *notificationstest.FakeNotification, tmpl database.Template, failedBuilds, totalBuilds int64, templateVersions []map[string]interface{}) {
verifyNotification := func(t *testing.T, recipient database.User, notif *notificationstest.FakeNotification, templates []map[string]any) {
t.Helper()
require.Equal(t, recipient.ID, notif.UserID)
require.Equal(t, notifications.TemplateWorkspaceBuildsFailedReport, notif.TemplateID)
require.Equal(t, tmpl.Name, notif.Labels["template_name"])
require.Equal(t, tmpl.DisplayName, notif.Labels["template_display_name"])
require.Equal(t, failedBuilds, notif.Data["failed_builds"])
require.Equal(t, totalBuilds, notif.Data["total_builds"])
require.Equal(t, "week", notif.Data["report_frequency"])
require.Equal(t, templateVersions, notif.Data["template_versions"])
require.Equal(t, templates, notif.Data["templates"])
}
// Setup
@ -369,38 +403,46 @@ func TestReportFailedWorkspaceBuilds(t *testing.T) {
sent := notifEnq.Sent()
require.Len(t, sent, 1) // 1 template, 1 template admin
verifyNotification(t, templateAdmin1, sent[0], t1, 46, 47, []map[string]interface{}{
verifyNotification(t, templateAdmin1, sent[0], []map[string]any{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(23), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(22), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(21), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(20), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(19), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(18), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(17), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(16), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(15), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(14), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
"name": t1.Name,
"display_name": t1.DisplayName,
"failed_builds": int64(46),
"total_builds": int64(47),
"versions": []map[string]any{
{
"failed_builds": []map[string]any{
{"build_number": int32(23), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(22), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(21), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(20), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(19), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(18), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(17), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(16), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(15), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(14), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
},
"failed_count": 23,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]any{
{"build_number": int32(123), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(122), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(121), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(120), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(119), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(118), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(117), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(116), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(115), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
{"build_number": int32(114), "workspace_name": w1.Name, "workspace_id": w1.ID, "workspace_owner_username": user1.Username},
},
"failed_count": 23,
"template_version_name": t1v2.Name,
},
},
"failed_count": 23,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(123), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(122), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(121), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(120), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(119), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(118), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(117), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(116), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(115), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(114), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 23,
"template_version_name": t1v2.Name,
},
})
})