feat: notifications: report failed workspace builds (#14571)

This commit is contained in:
Marcin Tojek
2024-09-18 09:11:44 +02:00
committed by GitHub
parent 1e5438eadb
commit 6de59371ea
29 changed files with 1545 additions and 55 deletions

View File

@ -0,0 +1,300 @@
package reports
import (
"context"
"database/sql"
"io"
"slices"
"sort"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/codersdk"
)
const (
// delay is the interval between report generator runs: the ticker is created
// with it and re-armed with it after every run.
delay = 15 * time.Minute
)
// NewReportGenerator starts a background goroutine that generates periodic
// notification reports and returns an io.Closer that stops it.
//
// One run is executed immediately, and another one `delay` after each run
// finishes. Every run happens inside a database transaction that first tries to
// take the report generator advisory lock; when the lock cannot be acquired the
// run is skipped. Failures are logged, never returned.
//
// Closing the returned value cancels the generator's context and waits for the
// goroutine to exit.
func NewReportGenerator(ctx context.Context, logger slog.Logger, db database.Store, enqueuer notifications.Enqueuer, clk quartz.Clock) io.Closer {
	done := make(chan struct{})

	ctx, cancel := context.WithCancel(ctx)
	//nolint:gocritic // The system generates periodic reports without direct user input.
	ctx = dbauthz.AsSystemRestricted(ctx)

	// The ticker is created stopped; it is only armed once a run has completed,
	// so the time between runs is measured from the end of the previous run.
	ticker := clk.NewTicker(delay)
	ticker.Stop()

	// generate is the body of a single run, executed inside the transaction tx.
	generate := func(tx database.Store, startedAt time.Time) error {
		// Acquire a lock to ensure that only one instance of the generator is running at a time.
		acquired, err := tx.TryAcquireLock(ctx, database.LockIDNotificationsReportGenerator)
		if err != nil {
			return xerrors.Errorf("failed to acquire report generator lock: %w", err)
		}
		if !acquired {
			logger.Debug(ctx, "unable to acquire lock for generating periodic reports, skipping")
			return nil
		}

		// NOTE(review): the report is generated through db, not tx, so its
		// queries are not part of this transaction — confirm this is intended.
		if err := reportFailedWorkspaceBuilds(ctx, logger, db, enqueuer, clk); err != nil {
			return xerrors.Errorf("unable to generate reports with failed workspace builds: %w", err)
		}

		logger.Info(ctx, "report generator finished", slog.F("duration", clk.Since(startedAt)))
		return nil
	}

	// runOnce performs one run and re-arms the ticker afterwards, whether the
	// run succeeded or not.
	runOnce := func(startedAt time.Time) {
		defer ticker.Reset(delay)

		// Start a transaction to grab advisory lock, we don't want to run generator jobs at the same time (multiple replicas).
		err := db.InTx(func(tx database.Store) error {
			return generate(tx, startedAt)
		}, nil)
		if err != nil {
			logger.Error(ctx, "failed to generate reports", slog.Error(err))
		}
	}

	go func() {
		defer close(done)
		defer ticker.Stop()

		// Force an initial tick.
		runOnce(dbtime.Time(clk.Now()).UTC())

		for {
			select {
			case <-ctx.Done():
				logger.Debug(ctx, "closing report generator")
				return
			case tick := <-ticker.C:
				// Keep the ticker silent while the run is in progress.
				ticker.Stop()
				runOnce(dbtime.Time(tick).UTC())
			}
		}
	}()

	return &reportGenerator{cancel: cancel, closed: done}
}
// reportGenerator is the handle returned by NewReportGenerator. It owns the
// means to stop the background goroutine and to wait for its exit.
type reportGenerator struct {
	// cancel cancels the context the background goroutine runs with.
	cancel context.CancelFunc
	// closed is closed by the background goroutine when it returns.
	closed chan struct{}
}

// Close cancels the generator's context and blocks until the background
// goroutine has exited. It always returns nil.
func (g *reportGenerator) Close() error {
	g.cancel()
	<-g.closed
	return nil
}
const (
// failedWorkspaceBuildsReportFrequency is both the minimum time between two
// reports and the look-back window used when querying for builds.
failedWorkspaceBuildsReportFrequency = 7 * 24 * time.Hour
// failedWorkspaceBuildsReportFrequencyLabel is the value sent as
// "report_frequency" in the notification data.
failedWorkspaceBuildsReportFrequencyLabel = "week"
)
// reportFailedWorkspaceBuilds sends the periodic "failed workspace builds"
// report: one notification per template admin for every template that had at
// least one failed build within the last failedWorkspaceBuildsReportFrequency.
//
// The very first invocation (no generator log row yet) only records the current
// time and sends nothing. Later invocations return early until a full reporting
// period has elapsed since the recorded time. After a reporting pass the
// recorded time is updated to the current clock time.
//
// Per-template lookup failures and enqueue failures are logged and skipped; they
// do not abort the pass. An error is returned when the generator log cannot be
// read or written, when the build stats query fails, or when ctx is done before
// the pass finishes. In the last case the recorded time is left untouched.
func reportFailedWorkspaceBuilds(ctx context.Context, logger slog.Logger, db database.Store, enqueuer notifications.Enqueuer, clk quartz.Clock) error {
	now := clk.Now()
	since := now.Add(-failedWorkspaceBuildsReportFrequency)

	// Firstly, check if this is the first run of the job ever
	reportLog, err := db.GetNotificationReportGeneratorLogByTemplate(ctx, notifications.TemplateWorkspaceBuildsFailedReport)
	if err != nil && !xerrors.Is(err, sql.ErrNoRows) {
		return xerrors.Errorf("unable to read report generator log: %w", err)
	}
	if xerrors.Is(err, sql.ErrNoRows) {
		// First run? Check-in the job, and get back after one week.
		logger.Info(ctx, "report generator is executing the job for the first time", slog.F("notification_template_id", notifications.TemplateWorkspaceBuildsFailedReport))
		err = db.UpsertNotificationReportGeneratorLog(ctx, database.UpsertNotificationReportGeneratorLogParams{
			NotificationTemplateID: notifications.TemplateWorkspaceBuildsFailedReport,
			LastGeneratedAt:        dbtime.Time(now).UTC(),
		})
		if err != nil {
			return xerrors.Errorf("unable to update report generator logs (first time execution): %w", err)
		}
		return nil
	}

	// Secondly, check if the job has not been running recently
	if !reportLog.LastGeneratedAt.IsZero() && reportLog.LastGeneratedAt.Add(failedWorkspaceBuildsReportFrequency).After(now) {
		return nil // reports sent recently, no need to send them now
	}

	// Thirdly, fetch workspace build stats by templates
	templateStatsRows, err := db.GetWorkspaceBuildStatsByTemplates(ctx, dbtime.Time(since).UTC())
	if err != nil {
		return xerrors.Errorf("unable to fetch failed workspace builds: %w", err)
	}

templates:
	for _, stats := range templateStatsRows {
		// Stop as soon as the context is done. The label is required to leave
		// both loops from the inner one.
		if ctx.Err() != nil {
			logger.Debug(ctx, "context is canceled, quitting", slog.Error(ctx.Err()))
			break templates
		}

		if stats.FailedBuilds == 0 {
			logger.Info(ctx, "no failed workspace builds found for template", slog.F("template_id", stats.TemplateID))
			continue
		}

		// Fetch template admins with org access to the templates
		templateAdmins, err := findTemplateAdmins(ctx, db, stats)
		if err != nil {
			logger.Error(ctx, "unable to find template admins for template", slog.F("template_id", stats.TemplateID), slog.Error(err))
			continue
		}

		// Fetch failed builds by the template
		failedBuilds, err := db.GetFailedWorkspaceBuildsByTemplateID(ctx, database.GetFailedWorkspaceBuildsByTemplateIDParams{
			TemplateID: stats.TemplateID,
			Since:      dbtime.Time(since).UTC(),
		})
		if err != nil {
			logger.Error(ctx, "unable to fetch failed workspace builds", slog.F("template_id", stats.TemplateID), slog.Error(err))
			continue
		}
		reportData := buildDataForReportFailedWorkspaceBuilds(stats, failedBuilds)

		// Send reports to template admins
		templateDisplayName := stats.TemplateDisplayName
		if templateDisplayName == "" {
			// Fall back to the template name when no display name is set.
			templateDisplayName = stats.TemplateName
		}

		for _, templateAdmin := range templateAdmins {
			if ctx.Err() != nil {
				logger.Debug(ctx, "context is canceled, quitting", slog.Error(ctx.Err()))
				break templates
			}

			if _, err := enqueuer.EnqueueWithData(ctx, templateAdmin.ID, notifications.TemplateWorkspaceBuildsFailedReport,
				map[string]string{
					"template_name":         stats.TemplateName,
					"template_display_name": templateDisplayName,
				},
				reportData,
				"report_generator",
				stats.TemplateID, stats.TemplateOrganizationID,
			); err != nil {
				// Best effort: one failed recipient must not block the others.
				logger.Warn(ctx, "failed to send a report with failed workspace builds", slog.Error(err))
			}
		}
	}

	// Do not record the run when it was interrupted: not every report was sent.
	// This covers cancellation as well as an expired deadline.
	if err := ctx.Err(); err != nil {
		logger.Error(ctx, "report generator job is canceled")
		return err
	}

	// Lastly, update the timestamp in the generator log.
	err = db.UpsertNotificationReportGeneratorLog(ctx, database.UpsertNotificationReportGeneratorLogParams{
		NotificationTemplateID: notifications.TemplateWorkspaceBuildsFailedReport,
		LastGeneratedAt:        dbtime.Time(now).UTC(),
	})
	if err != nil {
		return xerrors.Errorf("unable to update report generator logs: %w", err)
	}
	return nil
}
// workspaceBuildsLimitPerTemplateVersion caps how many failed builds are listed
// per template version in a report; the per-version failed count is not capped.
const workspaceBuildsLimitPerTemplateVersion = 10
// buildDataForReportFailedWorkspaceBuilds assembles the notification data for
// one template: the overall failed/total build counters from stats, the report
// frequency label, and the failed builds grouped by template version.
//
// failedBuilds is expected to arrive sorted by template version ascending and
// workspace build number descending: consecutive rows sharing a template
// version name are folded into one group. Each group counts all of its failed
// builds but lists at most workspaceBuildsLimitPerTemplateVersion of them.
func buildDataForReportFailedWorkspaceBuilds(stats database.GetWorkspaceBuildStatsByTemplatesRow, failedBuilds []database.GetFailedWorkspaceBuildsByTemplateIDRow) map[string]any {
	// describe renders a single failed build row as a notification data entry.
	describe := func(row database.GetFailedWorkspaceBuildsByTemplateIDRow) map[string]any {
		return map[string]any{
			"workspace_owner_username": row.WorkspaceOwnerUsername,
			"workspace_name":           row.WorkspaceName,
			"build_number":             row.WorkspaceBuildNumber,
		}
	}

	// The groups must be `[]map[string]any{}` to be compatible with data passed
	// to `NotificationEnqueuer`.
	versions := []map[string]any{}
	for _, row := range failedBuilds {
		last := len(versions) - 1

		// A different version name than the previous row opens a new group.
		if last < 0 || versions[last]["template_version_name"] != row.TemplateVersionName {
			versions = append(versions, map[string]any{
				"template_version_name": row.TemplateVersionName,
				"failed_count":          1,
				"failed_builds":         []map[string]any{describe(row)},
			})
			continue
		}

		// Same version as the previous row: update the group in place (maps are
		// references, so no write-back into the slice is needed).
		group := versions[last]
		//nolint:errorlint,forcetypeassert // only this function prepares the notification model
		group["failed_count"] = group["failed_count"].(int) + 1

		//nolint:errorlint,forcetypeassert // only this function prepares the notification model
		listed := group["failed_builds"].([]map[string]any)
		if len(listed) >= workspaceBuildsLimitPerTemplateVersion {
			// return N last builds to prevent long email reports
			continue
		}
		group["failed_builds"] = append(listed, describe(row))
	}

	return map[string]any{
		"failed_builds":     stats.FailedBuilds,
		"total_builds":      stats.TotalBuilds,
		"report_frequency":  failedWorkspaceBuildsReportFrequencyLabel,
		"template_versions": versions,
	}
}
// findTemplateAdmins returns the users holding the template admin role who are
// members of the organization that owns the template described by stats. The
// result is sorted by username; it is nil when nobody qualifies.
func findTemplateAdmins(ctx context.Context, db database.Store, stats database.GetWorkspaceBuildStatsByTemplatesRow) ([]database.GetUsersRow, error) {
	candidates, err := db.GetUsers(ctx, database.GetUsersParams{
		RbacRole: []string{codersdk.RoleTemplateAdmin},
	})
	if err != nil {
		return nil, xerrors.Errorf("unable to fetch template admins: %w", err)
	}
	if len(candidates) == 0 {
		return nil, nil
	}

	// Index the candidates so membership rows can be mapped back to users.
	byID := make(map[uuid.UUID]database.GetUsersRow, len(candidates))
	ids := make([]uuid.UUID, 0, len(candidates))
	for _, candidate := range candidates {
		byID[candidate.ID] = candidate
		ids = append(ids, candidate.ID)
	}

	memberships, err := db.GetOrganizationIDsByMemberIDs(ctx, ids)
	if err != nil {
		return nil, xerrors.Errorf("unable to fetch organization IDs by member IDs: %w", err)
	}

	// Keep only the candidates that belong to the template's organization.
	var admins []database.GetUsersRow
	for _, membership := range memberships {
		if !slices.Contains(membership.OrganizationIDs, stats.TemplateOrganizationID) {
			continue
		}
		admins = append(admins, byID[membership.UserID])
	}

	sort.Slice(admins, func(a, b int) bool {
		return admins[a].Username < admins[b].Username
	})
	return admins, nil
}

View File

@ -0,0 +1,475 @@
package reports
import (
"context"
"database/sql"
"testing"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/require"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/quartz"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/testutil"
)
// dayDuration is one day; the tests use it to place builds relative to the
// mock clock and to advance that clock.
const dayDuration = 24 * time.Hour
var (
// jobError and jobErrorCode are set on the provisioner jobs of the builds that
// the tests expect to be reported as failed.
jobError = sql.NullString{String: "badness", Valid: true}
jobErrorCode = sql.NullString{String: "ERR-42", Valid: true}
)
func TestReportFailedWorkspaceBuilds(t *testing.T) {
t.Parallel()
t.Run("EmptyState_NoBuilds_NoReport", func(t *testing.T) {
t.Parallel()
// Setup
ctx, logger, db, _, notifEnq, clk := setup(t)
// Database is ready, so we can clear notifications queue
notifEnq.Clear()
// When: first run
err := reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then: no report should be generated
require.NoError(t, err)
require.Empty(t, notifEnq.Sent)
// Given: one week later and no jobs were executed
clk.Advance(failedWorkspaceBuildsReportFrequency + time.Minute)
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then: report is still empty
require.NoError(t, err)
require.Empty(t, notifEnq.Sent)
})
t.Run("InitialState_NoBuilds_NoReport", func(t *testing.T) {
t.Parallel()
// Setup
ctx, logger, db, ps, notifEnq, clk := setup(t)
now := clk.Now()
// Organization
org := dbgen.Organization(t, db, database.Organization{})
// Template admins
templateAdmin1 := dbgen.User(t, db, database.User{Username: "template-admin-1", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: templateAdmin1.ID, OrganizationID: org.ID})
// Regular users
user1 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user1.ID, OrganizationID: org.ID})
user2 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user2.ID, OrganizationID: org.ID})
// Templates
t1 := dbgen.Template(t, db, database.Template{Name: "template-1", DisplayName: "First Template", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID})
// Template versions
t1v1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-1", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
// Workspaces
w1 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t1.ID, OwnerID: user1.ID, OrganizationID: org.ID})
w1wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-6 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 1, TemplateVersionID: t1v1.ID, JobID: w1wb1pj.ID, CreatedAt: now.Add(-2 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
// When: first run
notifEnq.Clear()
err := reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then: failed builds should not be reported
require.NoError(t, err)
require.Empty(t, notifEnq.Sent)
// Given: one week later, but still no jobs
clk.Advance(failedWorkspaceBuildsReportFrequency + time.Minute)
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then: report is still empty
require.NoError(t, err)
require.Empty(t, notifEnq.Sent)
})
t.Run("FailedBuilds_SecondRun_Report_ThirdRunTooEarly_NoReport_FourthRun_Report", func(t *testing.T) {
t.Parallel()
verifyNotification := func(t *testing.T, recipient database.User, notif *testutil.Notification, tmpl database.Template, failedBuilds, totalBuilds int64, templateVersions []map[string]interface{}) {
t.Helper()
require.Equal(t, recipient.ID, notif.UserID)
require.Equal(t, notifications.TemplateWorkspaceBuildsFailedReport, notif.TemplateID)
require.Equal(t, tmpl.Name, notif.Labels["template_name"])
require.Equal(t, tmpl.DisplayName, notif.Labels["template_display_name"])
require.Equal(t, failedBuilds, notif.Data["failed_builds"])
require.Equal(t, totalBuilds, notif.Data["total_builds"])
require.Equal(t, "week", notif.Data["report_frequency"])
require.Equal(t, templateVersions, notif.Data["template_versions"])
}
// Setup
ctx, logger, db, ps, notifEnq, clk := setup(t)
// Given
// Organization
org := dbgen.Organization(t, db, database.Organization{})
// Template admins
templateAdmin1 := dbgen.User(t, db, database.User{Username: "template-admin-1", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: templateAdmin1.ID, OrganizationID: org.ID})
templateAdmin2 := dbgen.User(t, db, database.User{Username: "template-admin-2", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: templateAdmin2.ID, OrganizationID: org.ID})
_ = dbgen.User(t, db, database.User{Name: "template-admin-3", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
// template admin in some other org, they should not receive any notification
// Regular users
user1 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user1.ID, OrganizationID: org.ID})
user2 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user2.ID, OrganizationID: org.ID})
// Templates
t1 := dbgen.Template(t, db, database.Template{Name: "template-1", DisplayName: "First Template", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID})
t2 := dbgen.Template(t, db, database.Template{Name: "template-2", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID})
// Template versions
t1v1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-1", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
t1v2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-2", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
t2v1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-2-version-1", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t2.ID, Valid: true}, JobID: uuid.New()})
t2v2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-2-version-2", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t2.ID, Valid: true}, JobID: uuid.New()})
// Workspaces
w1 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t1.ID, OwnerID: user1.ID, OrganizationID: org.ID})
w2 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t2.ID, OwnerID: user2.ID, OrganizationID: org.ID})
w3 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t1.ID, OwnerID: user1.ID, OrganizationID: org.ID})
w4 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t2.ID, OwnerID: user2.ID, OrganizationID: org.ID})
// When: first run
notifEnq.Clear()
err := reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then
require.NoError(t, err)
require.Empty(t, notifEnq.Sent) // no notifications
// One week later...
clk.Advance(failedWorkspaceBuildsReportFrequency + time.Minute)
now := clk.Now()
// Workspace builds
w1wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-6 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 1, TemplateVersionID: t1v1.ID, JobID: w1wb1pj.ID, CreatedAt: now.Add(-6 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w1wb2pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-5 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 2, TemplateVersionID: t1v2.ID, JobID: w1wb2pj.ID, CreatedAt: now.Add(-5 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w1wb3pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-4 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 3, TemplateVersionID: t1v2.ID, JobID: w1wb3pj.ID, CreatedAt: now.Add(-4 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w2wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-5 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w2.ID, BuildNumber: 4, TemplateVersionID: t2v1.ID, JobID: w2wb1pj.ID, CreatedAt: now.Add(-5 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w2wb2pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-4 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w2.ID, BuildNumber: 5, TemplateVersionID: t2v2.ID, JobID: w2wb2pj.ID, CreatedAt: now.Add(-4 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w2wb3pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-3 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w2.ID, BuildNumber: 6, TemplateVersionID: t2v2.ID, JobID: w2wb3pj.ID, CreatedAt: now.Add(-3 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w3wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-3 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w3.ID, BuildNumber: 7, TemplateVersionID: t1v1.ID, JobID: w3wb1pj.ID, CreatedAt: now.Add(-3 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w4wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-6 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w4.ID, BuildNumber: 8, TemplateVersionID: t2v1.ID, JobID: w4wb1pj.ID, CreatedAt: now.Add(-6 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w4wb2pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w4.ID, BuildNumber: 9, TemplateVersionID: t2v2.ID, JobID: w4wb2pj.ID, CreatedAt: now.Add(-dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, authedDB(t, db, logger), notifEnq, clk)
// Then
require.NoError(t, err)
require.Len(t, notifEnq.Sent, 4) // 2 templates, 2 template admins
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, notifEnq.Sent[i], t1, 3, 4, []map[string]interface{}{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(7), "workspace_name": w3.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(1), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 2,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(3), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
})
}
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, notifEnq.Sent[i+2], t2, 3, 5, []map[string]interface{}{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(8), "workspace_name": w4.Name, "workspace_owner_username": user2.Username},
},
"failed_count": 1,
"template_version_name": t2v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(6), "workspace_name": w2.Name, "workspace_owner_username": user2.Username},
{"build_number": int32(5), "workspace_name": w2.Name, "workspace_owner_username": user2.Username},
},
"failed_count": 2,
"template_version_name": t2v2.Name,
},
})
}
// Given: 6 days later (less than report frequency), and failed build
clk.Advance(6 * dayDuration).MustWait(context.Background())
now = clk.Now()
w1wb4pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: now.Add(-dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 77, TemplateVersionID: t1v2.ID, JobID: w1wb4pj.ID, CreatedAt: now.Add(-dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, authedDB(t, db, logger), notifEnq, clk)
require.NoError(t, err)
// Then: no notifications as it is too early
require.Empty(t, notifEnq.Sent)
// Given: 1 day 1 hour later
clk.Advance(dayDuration + time.Hour).MustWait(context.Background())
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, authedDB(t, db, logger), notifEnq, clk)
require.NoError(t, err)
// Then: we should see the failed job in the report
require.Len(t, notifEnq.Sent, 2) // a new failed job should be reported
for i, templateAdmin := range []database.User{templateAdmin1, templateAdmin2} {
verifyNotification(t, templateAdmin, notifEnq.Sent[i], t1, 1, 1, []map[string]interface{}{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(77), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 1,
"template_version_name": t1v2.Name,
},
})
}
})
t.Run("TooManyFailedBuilds_SecondRun_Report", func(t *testing.T) {
t.Parallel()
verifyNotification := func(t *testing.T, recipient database.User, notif *testutil.Notification, tmpl database.Template, failedBuilds, totalBuilds int64, templateVersions []map[string]interface{}) {
t.Helper()
require.Equal(t, recipient.ID, notif.UserID)
require.Equal(t, notifications.TemplateWorkspaceBuildsFailedReport, notif.TemplateID)
require.Equal(t, tmpl.Name, notif.Labels["template_name"])
require.Equal(t, tmpl.DisplayName, notif.Labels["template_display_name"])
require.Equal(t, failedBuilds, notif.Data["failed_builds"])
require.Equal(t, totalBuilds, notif.Data["total_builds"])
require.Equal(t, "week", notif.Data["report_frequency"])
require.Equal(t, templateVersions, notif.Data["template_versions"])
}
// Setup
ctx, logger, db, ps, notifEnq, clk := setup(t)
// Given
// Organization
org := dbgen.Organization(t, db, database.Organization{})
// Template admins
templateAdmin1 := dbgen.User(t, db, database.User{Username: "template-admin-1", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: templateAdmin1.ID, OrganizationID: org.ID})
// Regular users
user1 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user1.ID, OrganizationID: org.ID})
// Templates
t1 := dbgen.Template(t, db, database.Template{Name: "template-1", DisplayName: "First Template", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID})
// Template versions
t1v1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-1", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
t1v2 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-2", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
// Workspaces
w1 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t1.ID, OwnerID: user1.ID, OrganizationID: org.ID})
// When: first run
notifEnq.Clear()
err := reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then
require.NoError(t, err)
require.Empty(t, notifEnq.Sent) // no notifications
// One week later...
clk.Advance(failedWorkspaceBuildsReportFrequency + time.Minute)
now := clk.Now()
// Workspace builds
pj0 := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-24 * time.Hour), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 777, TemplateVersionID: t1v1.ID, JobID: pj0.ID, CreatedAt: now.Add(-24 * time.Hour), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
for i := 1; i <= 23; i++ {
at := now.Add(-time.Duration(i) * time.Hour)
pj1 := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: at, Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: int32(i), TemplateVersionID: t1v1.ID, JobID: pj1.ID, CreatedAt: at, Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
pj2 := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, Error: jobError, ErrorCode: jobErrorCode, CompletedAt: sql.NullTime{Time: at, Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: int32(i) + 100, TemplateVersionID: t1v2.ID, JobID: pj2.ID, CreatedAt: at, Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
}
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, authedDB(t, db, logger), notifEnq, clk)
// Then
require.NoError(t, err)
require.Len(t, notifEnq.Sent, 1) // 1 template, 1 template admin
verifyNotification(t, templateAdmin1, notifEnq.Sent[0], t1, 46, 47, []map[string]interface{}{
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(23), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(22), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(21), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(20), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(19), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(18), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(17), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(16), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(15), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(14), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 23,
"template_version_name": t1v1.Name,
},
{
"failed_builds": []map[string]interface{}{
{"build_number": int32(123), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(122), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(121), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(120), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(119), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(118), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(117), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(116), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(115), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
{"build_number": int32(114), "workspace_name": w1.Name, "workspace_owner_username": user1.Username},
},
"failed_count": 23,
"template_version_name": t1v2.Name,
},
})
})
t.Run("NoFailedBuilds_NoReport", func(t *testing.T) {
t.Parallel()
// Setup
ctx, logger, db, ps, notifEnq, clk := setup(t)
// Given
// Organization
org := dbgen.Organization(t, db, database.Organization{})
// Template admins
templateAdmin1 := dbgen.User(t, db, database.User{Username: "template-admin-1", RBACRoles: []string{rbac.RoleTemplateAdmin().Name}})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: templateAdmin1.ID, OrganizationID: org.ID})
// Regular users
user1 := dbgen.User(t, db, database.User{})
_ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user1.ID, OrganizationID: org.ID})
// Templates
t1 := dbgen.Template(t, db, database.Template{Name: "template-1", DisplayName: "First Template", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID})
// Template versions
t1v1 := dbgen.TemplateVersion(t, db, database.TemplateVersion{Name: "template-1-version-1", CreatedBy: templateAdmin1.ID, OrganizationID: org.ID, TemplateID: uuid.NullUUID{UUID: t1.ID, Valid: true}, JobID: uuid.New()})
// Workspaces
w1 := dbgen.Workspace(t, db, database.Workspace{TemplateID: t1.ID, OwnerID: user1.ID, OrganizationID: org.ID})
// When: first run
notifEnq.Clear()
err := reportFailedWorkspaceBuilds(ctx, logger, db, notifEnq, clk)
// Then: no notifications
require.NoError(t, err)
require.Empty(t, notifEnq.Sent)
// Given: one week later, and a successful few jobs being executed
clk.Advance(failedWorkspaceBuildsReportFrequency + time.Minute)
now := clk.Now()
// Workspace builds
w1wb1pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-6 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 1, TemplateVersionID: t1v1.ID, JobID: w1wb1pj.ID, CreatedAt: now.Add(-2 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
w1wb2pj := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{OrganizationID: org.ID, CompletedAt: sql.NullTime{Time: now.Add(-5 * dayDuration), Valid: true}})
_ = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{WorkspaceID: w1.ID, BuildNumber: 2, TemplateVersionID: t1v1.ID, JobID: w1wb2pj.ID, CreatedAt: now.Add(-1 * dayDuration), Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator})
// When
notifEnq.Clear()
err = reportFailedWorkspaceBuilds(ctx, logger, authedDB(t, db, logger), notifEnq, clk)
// Then: no failures? nothing to report
require.NoError(t, err)
require.Len(t, notifEnq.Sent, 0) // all jobs succeeded so nothing to report
})
}
// setup builds the fixtures shared by the subtests, in this order: a
// system-restricted context, a test logger, a test database with its pubsub, a
// fake notifications enqueuer and a mock clock.
func setup(t *testing.T) (context.Context, slog.Logger, database.Store, pubsub.Pubsub, *testutil.FakeNotificationsEnqueuer, *quartz.Mock) {
	t.Helper()

	// nolint:gocritic // reportFailedWorkspaceBuilds is called by system.
	sysCtx := dbauthz.AsSystemRestricted(context.Background())
	testLogger := slogtest.Make(t, &slogtest.Options{})
	store, storePubsub := dbtestutil.NewDB(t)
	enqueuer := &testutil.FakeNotificationsEnqueuer{}
	mockClock := quartz.NewMock(t)

	return sysCtx, testLogger, store, storePubsub, enqueuer, mockClock
}
// authedDB wraps db in a dbauthz store that uses a fresh RBAC authorizer, so
// calls made through the returned store pass through dbauthz.
func authedDB(t *testing.T, db database.Store, logger slog.Logger) database.Store {
	t.Helper()

	authorizer := rbac.NewAuthorizer(prometheus.NewRegistry())
	accessControl := coderdtest.AccessControlStorePointer()
	return dbauthz.New(db, authorizer, logger, accessControl)
}