fix(agent): ensure SSH server shutdown with process groups (#17227)

Fix hanging workspace shutdowns caused by orphaned SSH child processes.
Key changes:

- Create process groups for non-PTY SSH sessions
- Send SIGHUP to entire process group for proper termination
- Add 5-second timeout to prevent indefinite blocking

Fixes #17108
This commit is contained in:
Mathias Fredriksson
2025-04-03 16:01:43 +03:00
committed by GitHub
parent b60934b180
commit b61f0ab958
5 changed files with 195 additions and 61 deletions

View File

@ -1773,15 +1773,22 @@ func (a *agent) Close() error {
a.setLifecycle(codersdk.WorkspaceAgentLifecycleShuttingDown)
// Attempt to gracefully shut down all active SSH connections and
// stop accepting new ones.
err := a.sshServer.Shutdown(a.hardCtx)
// stop accepting new ones. If all processes have not exited after 5
// seconds, we just log it and move on as it's more important to run
// the shutdown scripts. A typical shutdown time for containers is
// 10 seconds, so this still leaves a bit of time to run the
// shutdown scripts in the worst-case.
sshShutdownCtx, sshShutdownCancel := context.WithTimeout(a.hardCtx, 5*time.Second)
defer sshShutdownCancel()
err := a.sshServer.Shutdown(sshShutdownCtx)
if err != nil {
a.logger.Error(a.hardCtx, "ssh server shutdown", slog.Error(err))
}
err = a.sshServer.Close()
if err != nil {
a.logger.Error(a.hardCtx, "ssh server close", slog.Error(err))
if errors.Is(err, context.DeadlineExceeded) {
a.logger.Warn(sshShutdownCtx, "ssh server shutdown timeout", slog.Error(err))
} else {
a.logger.Error(sshShutdownCtx, "ssh server shutdown", slog.Error(err))
}
}
// wait for SSH to shut down before the general graceful cancel, because
// this triggers a disconnect in the tailnet layer, telling all clients to
// shut down their wireguard tunnels to us. If SSH sessions are still up,