fix: Prevent autobuild executor from slowing down API requests (#3726)

With just a few workspaces, the autobuild executor can slow down API
requests every time it runs. This is because we started a long running
transaction and checked all eligible (for autostart) workspaces inside
that transaction. PostgreSQL doesn't know if we're modifying rows and as
such is locking the tables for read operations.

This commit changes the behavior so each workspace is checked in its own
transaction reducing the time the table/rows needs to stay locked.

For now concurrency has been arbitrarily limited to 10 workspaces at a
time, this could be made configurable or adjusted as the need arises.
This commit is contained in:
Mathias Fredriksson
2022-09-02 13:24:47 +03:00
committed by GitHub
parent 3f73243b37
commit 4c18034260
5 changed files with 112 additions and 154 deletions

View File

@ -5,14 +5,14 @@ import (
"encoding/json"
"time"
"cdr.dev/slog"
"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"
"github.com/google/uuid"
"github.com/moby/moby/pkg/namesgenerator"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"
)
// Executor automatically starts or stops workspaces.
@ -89,80 +89,116 @@ func (e *Executor) runOnce(t time.Time) Stats {
stats.Error = err
}()
currentTick := t.Truncate(time.Minute)
err = e.db.InTx(func(db database.Store) error {
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx)
if err != nil {
return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err)
}
for _, ws := range eligibleWorkspaces {
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
e.log.Warn(e.ctx, "get latest workspace build",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
e.log.Debug(e.ctx, "skipping workspace",
slog.Error(err),
slog.F("workspace_id", ws.ID),
)
continue
}
if currentTick.Before(nextTransition) {
e.log.Debug(e.ctx, "skipping workspace: too early",
slog.F("workspace_id", ws.ID),
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
continue
}
e.log.Info(e.ctx, "scheduling workspace transition",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
)
stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
e.log.Error(e.ctx, "unable to transition workspace",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
slog.Error(err),
)
}
}
return nil
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspaces(e.ctx, database.GetWorkspacesParams{
Deleted: false,
})
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
}
var eligibleWorkspaceIDs []uuid.UUID
for _, ws := range workspaces {
if isEligibleForAutoStartStop(ws) {
eligibleWorkspaceIDs = append(eligibleWorkspaceIDs, ws.ID)
}
}
// We only use errgroup here for convenience of API, not for early
// cancellation. This means we only return nil errors in th eg.Go.
eg := errgroup.Group{}
// Limit the concurrency to avoid overloading the database.
eg.SetLimit(10)
for _, wsID := range eligibleWorkspaceIDs {
wsID := wsID
log := e.log.With(slog.F("workspace_id", wsID))
eg.Go(func() error {
err := e.db.InTx(func(db database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
}
if !isEligibleForAutoStartStop(ws) {
return nil
}
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
}
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
return nil
}
if currentTick.Before(nextTransition) {
log.Debug(e.ctx, "skipping workspace: too early",
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
return nil
}
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", validTransition),
slog.Error(err),
)
return nil
}
return nil
})
if err != nil {
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
}
return nil
})
}
// This should not happen since we don't want early cancellation.
err = eg.Wait()
if err != nil {
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
}
return stats
}
func isEligibleForAutoStartStop(ws database.Workspace) bool {
return !ws.Deleted && (ws.AutostartSchedule.String != "" || ws.Ttl.Int64 > 0)
}
func getNextTransition(
ws database.Workspace,
priorHistory database.WorkspaceBuild,