diff --git a/.vscode/settings.json b/.vscode/settings.json index ca19998fa6..b63792e247 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -33,6 +33,7 @@ "enablements", "errgroup", "eventsourcemock", + "Failf", "fatih", "Formik", "gitauth", @@ -160,10 +161,7 @@ "xstate", "yamux" ], - "cSpell.ignorePaths": [ - "site/package.json", - ".vscode/settings.json" - ], + "cSpell.ignorePaths": ["site/package.json", ".vscode/settings.json"], "emeraldwalk.runonsave": { "commands": [ { @@ -195,10 +193,7 @@ // To reduce redundancy in tests, it's covered by other packages. // Since package coverage pairing can't be defined, all packages cover // all other packages. - "go.testFlags": [ - "-short", - "-coverpkg=./..." - ], + "go.testFlags": ["-short", "-coverpkg=./..."], // We often use a version of TypeScript that's ahead of the version shipped // with VS Code. "typescript.tsdk": "./site/node_modules/typescript/lib" diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index 240d042ee5..e14d6bbc3d 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -14,6 +14,8 @@ import ( "github.com/google/uuid" "github.com/tabbed/pqtype" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" "golang.org/x/xerrors" protobuf "google.golang.org/protobuf/proto" @@ -631,14 +633,58 @@ func (server *Server) CompleteJob(ctx context.Context, completed *proto.Complete if err != nil { return xerrors.Errorf("update workspace build: %w", err) } + + agentTimeouts := make(map[time.Duration]bool) // A set of agent timeouts. // This could be a bulk insert to improve performance. for _, protoResource := range jobType.WorkspaceBuild.Resources { + for _, protoAgent := range protoResource.Agents { + dur := time.Duration(protoAgent.GetConnectionTimeoutSeconds()) * time.Second + agentTimeouts[dur] = true + } err = InsertWorkspaceResource(ctx, db, job.ID, workspaceBuild.Transition, protoResource, telemetrySnapshot) if err != nil { return xerrors.Errorf("insert provisioner job: %w", err) } } + // On start, we want to ensure that workspace agents timeout statuses + // are propagated. This method is simple and does not protect against + // notifying in edge cases like when a workspace is stopped soon + // after being started. + // + // Agent timeouts could be minutes apart, resulting in an unresponsive + // experience, so we'll notify after every unique timeout seconds. + if !input.DryRun && workspaceBuild.Transition == database.WorkspaceTransitionStart && len(agentTimeouts) > 0 { + timeouts := maps.Keys(agentTimeouts) + slices.Sort(timeouts) + + var updates []<-chan time.Time + for _, d := range timeouts { + server.Logger.Debug(ctx, "triggering workspace notification after agent timeout", + slog.F("workspace_build_id", workspaceBuild.ID), + slog.F("timeout", d), + ) + // Agents are inserted with `database.Now()`, this triggers a + // workspace event approximately after created + timeout seconds. + updates = append(updates, time.After(d)) + } + go func() { + for _, wait := range updates { + // Wait for the next potential timeout to occur. Note that we + // can't listen on the context here because we will hang around + // after this function has returned. The server also doesn't + // have a shutdown signal we can listen to. + <-wait + if err := server.Pubsub.Publish(codersdk.WorkspaceNotifyChannel(workspaceBuild.WorkspaceID), []byte{}); err != nil { + server.Logger.Error(ctx, "workspace notification after agent timeout failed", + slog.F("workspace_build_id", workspaceBuild.ID), + slog.Error(err), + ) + } + } + }() + } + if workspaceBuild.Transition != database.WorkspaceTransitionDelete { // This is for deleting a workspace! return nil diff --git a/coderd/workspaces_test.go b/coderd/workspaces_test.go index 1711de1ba3..1c9c9fb991 100644 --- a/coderd/workspaces_test.go +++ b/coderd/workspaces_test.go @@ -1357,6 +1357,7 @@ func TestWorkspaceWatcher(t *testing.T) { Auth: &proto.Agent_Token{ Token: authToken, }, + ConnectionTimeoutSeconds: 1, }}, }}, }, @@ -1372,58 +1373,64 @@ func TestWorkspaceWatcher(t *testing.T) { wc, err := client.WatchWorkspace(ctx, workspace.ID) require.NoError(t, err) - wait := func() { + + // Wait events are easier to debug with timestamped logs. + logger := slogtest.Make(t, nil).Named(t.Name()).Leveled(slog.LevelDebug) + wait := func(event string) { select { case <-ctx.Done(): - t.Fail() + require.FailNow(t, "timed out waiting for event", event) case <-wc: + logger.Info(ctx, "done waiting for event", slog.F("event", event)) } } coderdtest.CreateWorkspaceBuild(t, client, workspace, database.WorkspaceTransitionStart) - // the workspace build being created - wait() - // the workspace build being acquired - wait() - // the workspace build completing - wait() + wait("workspace build being created") + wait("workspace build being acquired") + wait("workspace build completing") + + // Unfortunately, this will add ~1s to the test due to the granularity + // of agent timeout seconds. However, if we don't do this we won't know + // which trigger we received when waiting for connection. + // + // Note that the first timeout is from `coderdtest.CreateWorkspace` and + // the latter is from `coderdtest.CreateWorkspaceBuild`. + wait("agent timeout after create") + wait("agent timeout after start") agentClient := codersdk.New(client.URL) agentClient.SetSessionToken(authToken) agentCloser := agent.New(agent.Options{ Client: agentClient, - Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug), + Logger: logger.Named("agent"), }) defer func() { _ = agentCloser.Close() }() - // the agent connected - wait() + wait("agent connected") agentCloser.Close() - // the agent disconnected - wait() + wait("agent disconnected") closeFunc.Close() build := coderdtest.CreateWorkspaceBuild(t, client, workspace, database.WorkspaceTransitionStart) - // First is for the workspace build itself - wait() + wait("first is for the workspace build itself") err = client.CancelWorkspaceBuild(ctx, build.ID) require.NoError(t, err) - // Second is for the build cancel - wait() + wait("second is for the build cancel") err = client.UpdateWorkspace(ctx, workspace.ID, codersdk.UpdateWorkspaceRequest{ Name: "another", }) require.NoError(t, err) - wait() + wait("update workspace name") err = client.UpdateActiveTemplateVersion(ctx, template.ID, codersdk.UpdateActiveTemplateVersion{ ID: template.ActiveVersionID, }) require.NoError(t, err) - wait() + wait("update active template version") cancel() }