fix(agent/agentssh): use deterministic host key for SSH server (#16626)

Fixes: https://github.com/coder/coder/issues/16490

The Agent's SSH server now initially generates fixed host keys and, once it receives its manifest, generates and replaces that host key with the one derived from the workspace ID, ensuring consistency across agent restarts. This prevents SSH warnings and host key verification errors when connecting to workspaces through Coder Desktop.

While deterministic keys might seem insecure, the underlying Wireguard tunnel already provides encryption and anti-spoofing protection at the network layer, making this approach acceptable for our use case.

---
Change-Id: I8c7e3070324e5d558374fd6891eea9d48660e1e9
Signed-off-by: Thomas Kosiewski <tk@coder.com>
This commit is contained in:
Thomas Kosiewski
2025-02-21 14:58:41 +01:00
committed by GitHub
parent e8a7b7e8cb
commit 660746462e
6 changed files with 226 additions and 17 deletions

View File

@ -6,6 +6,7 @@ import (
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"io"
"net/http"
"net/netip"
@ -994,7 +995,6 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
if err := manifestOK.wait(ctx); err != nil {
return xerrors.Errorf("no manifest: %w", err)
}
var err error
defer func() {
networkOK.complete(retErr)
}()
@ -1003,9 +1003,20 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
network := a.network
a.closeMutex.Unlock()
if network == nil {
keySeed, err := WorkspaceKeySeed(manifest.WorkspaceID, manifest.AgentName)
if err != nil {
return xerrors.Errorf("generate seed from workspace id: %w", err)
}
// use the graceful context here, because creating the tailnet is not itself tied to the
// agent API.
network, err = a.createTailnet(a.gracefulCtx, manifest.AgentID, manifest.DERPMap, manifest.DERPForceWebSockets, manifest.DisableDirectConnections)
network, err = a.createTailnet(
a.gracefulCtx,
manifest.AgentID,
manifest.DERPMap,
manifest.DERPForceWebSockets,
manifest.DisableDirectConnections,
keySeed,
)
if err != nil {
return xerrors.Errorf("create tailnet: %w", err)
}
@ -1145,7 +1156,13 @@ func (a *agent) trackGoroutine(fn func()) error {
return nil
}
func (a *agent) createTailnet(ctx context.Context, agentID uuid.UUID, derpMap *tailcfg.DERPMap, derpForceWebSockets, disableDirectConnections bool) (_ *tailnet.Conn, err error) {
func (a *agent) createTailnet(
ctx context.Context,
agentID uuid.UUID,
derpMap *tailcfg.DERPMap,
derpForceWebSockets, disableDirectConnections bool,
keySeed int64,
) (_ *tailnet.Conn, err error) {
// Inject `CODER_AGENT_HEADER` into the DERP header.
var header http.Header
if client, ok := a.client.(*agentsdk.Client); ok {
@ -1172,6 +1189,10 @@ func (a *agent) createTailnet(ctx context.Context, agentID uuid.UUID, derpMap *t
}
}()
if err := a.sshServer.UpdateHostSigner(keySeed); err != nil {
return nil, xerrors.Errorf("update host signer: %w", err)
}
sshListener, err := network.Listen("tcp", ":"+strconv.Itoa(workspacesdk.AgentSSHPort))
if err != nil {
return nil, xerrors.Errorf("listen on the ssh port: %w", err)
@ -1849,3 +1870,20 @@ func PrometheusMetricsHandler(prometheusRegistry *prometheus.Registry, logger sl
}
})
}
// WorkspaceKeySeed converts a WorkspaceID UUID and agent name to an int64 hash.
// This uses the FNV-1a hash algorithm which provides decent distribution and collision
// resistance for string inputs.
func WorkspaceKeySeed(workspaceID uuid.UUID, agentName string) (int64, error) {
h := fnv.New64a()
_, err := h.Write(workspaceID[:])
if err != nil {
return 42, err
}
_, err = h.Write([]byte(agentName))
if err != nil {
return 42, err
}
return int64(h.Sum64()), nil
}

View File

@ -3,11 +3,12 @@ package agentssh
import (
"bufio"
"context"
"crypto/rand"
"crypto/rsa"
"errors"
"fmt"
"io"
"math/big"
"math/rand"
"net"
"os"
"os/exec"
@ -128,17 +129,6 @@ type Server struct {
}
func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prometheus.Registry, fs afero.Fs, execer agentexec.Execer, config *Config) (*Server, error) {
// Clients' should ignore the host key when connecting.
// The agent needs to authenticate with coderd to SSH,
// so SSH authentication doesn't improve security.
randomHostKey, err := rsa.GenerateKey(rand.Reader, 2048)
if err != nil {
return nil, err
}
randomSigner, err := gossh.NewSignerFromKey(randomHostKey)
if err != nil {
return nil, err
}
if config == nil {
config = &Config{}
}
@ -206,7 +196,9 @@ func NewServer(ctx context.Context, logger slog.Logger, prometheusRegistry *prom
slog.Error(err))
},
Handler: s.sessionHandler,
HostSigners: []ssh.Signer{randomSigner},
// HostSigners are intentionally empty, as the host key will
// be set before we start listening.
HostSigners: []ssh.Signer{},
LocalPortForwardingCallback: func(ctx ssh.Context, destinationHost string, destinationPort uint32) bool {
// Allow local port forwarding all!
s.logger.Debug(ctx, "local port forward",
@ -844,7 +836,13 @@ func (s *Server) CreateCommand(ctx context.Context, script string, env []string,
return cmd, nil
}
// Serve starts the server to handle incoming connections on the provided listener.
// It returns an error if no host keys are set or if there is an issue accepting connections.
func (s *Server) Serve(l net.Listener) (retErr error) {
if len(s.srv.HostSigners) == 0 {
return xerrors.New("no host keys set")
}
s.logger.Info(context.Background(), "started serving listener", slog.F("listen_addr", l.Addr()))
defer func() {
s.logger.Info(context.Background(), "stopped serving listener",
@ -1099,3 +1097,99 @@ func userHomeDir() (string, error) {
}
return u.HomeDir, nil
}
// UpdateHostSigner updates the host signer with a new key generated from the provided seed.
// If an existing host key exists with the same algorithm, it is overwritten
func (s *Server) UpdateHostSigner(seed int64) error {
key, err := CoderSigner(seed)
if err != nil {
return err
}
s.mu.Lock()
defer s.mu.Unlock()
s.srv.AddHostKey(key)
return nil
}
// CoderSigner generates a deterministic SSH signer based on the provided seed.
// It uses RSA with a key size of 2048 bits.
func CoderSigner(seed int64) (gossh.Signer, error) {
// Clients should ignore the host key when connecting.
// The agent needs to authenticate with coderd to SSH,
// so SSH authentication doesn't improve security.
// Since the standard lib purposefully does not generate
// deterministic rsa keys, we need to do it ourselves.
coderHostKey := func() *rsa.PrivateKey {
// Create deterministic random source
// nolint: gosec
deterministicRand := rand.New(rand.NewSource(seed))
// Use fixed values for p and q based on the seed
p := big.NewInt(0)
q := big.NewInt(0)
e := big.NewInt(65537) // Standard RSA public exponent
// Generate deterministic primes using the seeded random
// Each prime should be ~1024 bits to get a 2048-bit key
for {
p.SetBit(p, 1024, 1) // Ensure it's large enough
for i := 0; i < 1024; i++ {
if deterministicRand.Int63()%2 == 1 {
p.SetBit(p, i, 1)
} else {
p.SetBit(p, i, 0)
}
}
if p.ProbablyPrime(20) {
break
}
}
for {
q.SetBit(q, 1024, 1) // Ensure it's large enough
for i := 0; i < 1024; i++ {
if deterministicRand.Int63()%2 == 1 {
q.SetBit(q, i, 1)
} else {
q.SetBit(q, i, 0)
}
}
if q.ProbablyPrime(20) && p.Cmp(q) != 0 {
break
}
}
// Calculate n = p * q
n := new(big.Int).Mul(p, q)
// Calculate phi = (p-1) * (q-1)
p1 := new(big.Int).Sub(p, big.NewInt(1))
q1 := new(big.Int).Sub(q, big.NewInt(1))
phi := new(big.Int).Mul(p1, q1)
// Calculate private exponent d
d := new(big.Int).ModInverse(e, phi)
// Create the private key
privateKey := &rsa.PrivateKey{
PublicKey: rsa.PublicKey{
N: n,
E: int(e.Int64()),
},
D: d,
Primes: []*big.Int{p, q},
}
// Compute precomputed values
privateKey.Precompute()
return privateKey
}()
coderSigner, err := gossh.NewSignerFromKey(coderHostKey)
return coderSigner, err
}

View File

@ -39,6 +39,8 @@ func Test_sessionStart_orphan(t *testing.T) {
s, err := NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), agentexec.DefaultExecer, nil)
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
// Here we're going to call the handler directly with a faked SSH session
// that just uses io.Pipes instead of a network socket. There is a large

View File

@ -41,6 +41,8 @@ func TestNewServer_ServeClient(t *testing.T) {
s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), agentexec.DefaultExecer, nil)
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
ln, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)
@ -146,6 +148,8 @@ func TestNewServer_CloseActiveConnections(t *testing.T) {
s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), agentexec.DefaultExecer, nil)
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
ln, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)
@ -197,6 +201,8 @@ func TestNewServer_Signal(t *testing.T) {
s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), agentexec.DefaultExecer, nil)
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
ln, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)
@ -262,6 +268,8 @@ func TestNewServer_Signal(t *testing.T) {
s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), afero.NewMemMapFs(), agentexec.DefaultExecer, nil)
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
ln, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)

View File

@ -38,6 +38,8 @@ func TestServer_X11(t *testing.T) {
s, err := agentssh.NewServer(ctx, logger, prometheus.NewRegistry(), fs, agentexec.DefaultExecer, &agentssh.Config{})
require.NoError(t, err)
defer s.Close()
err = s.UpdateHostSigner(42)
assert.NoError(t, err)
ln, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)

View File

@ -453,6 +453,71 @@ func TestSSH(t *testing.T) {
<-cmdDone
})
t.Run("DeterministicHostKey", func(t *testing.T) {
t.Parallel()
client, workspace, agentToken := setupWorkspaceForAgent(t)
_, _ = tGoContext(t, func(ctx context.Context) {
// Run this async so the SSH command has to wait for
// the build and agent to connect!
_ = agenttest.New(t, client.URL, agentToken)
<-ctx.Done()
})
clientOutput, clientInput := io.Pipe()
serverOutput, serverInput := io.Pipe()
defer func() {
for _, c := range []io.Closer{clientOutput, clientInput, serverOutput, serverInput} {
_ = c.Close()
}
}()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
inv, root := clitest.New(t, "ssh", "--stdio", workspace.Name)
clitest.SetupConfig(t, client, root)
inv.Stdin = clientOutput
inv.Stdout = serverInput
inv.Stderr = io.Discard
cmdDone := tGo(t, func() {
err := inv.WithContext(ctx).Run()
assert.NoError(t, err)
})
keySeed, err := agent.WorkspaceKeySeed(workspace.ID, "dev")
assert.NoError(t, err)
signer, err := agentssh.CoderSigner(keySeed)
assert.NoError(t, err)
conn, channels, requests, err := ssh.NewClientConn(&stdioConn{
Reader: serverOutput,
Writer: clientInput,
}, "", &ssh.ClientConfig{
HostKeyCallback: ssh.FixedHostKey(signer.PublicKey()),
})
require.NoError(t, err)
defer conn.Close()
sshClient := ssh.NewClient(conn, channels, requests)
session, err := sshClient.NewSession()
require.NoError(t, err)
defer session.Close()
command := "sh -c exit"
if runtime.GOOS == "windows" {
command = "cmd.exe /c exit"
}
err = session.Run(command)
require.NoError(t, err)
err = sshClient.Close()
require.NoError(t, err)
_ = clientOutput.Close()
<-cmdDone
})
t.Run("NetworkInfo", func(t *testing.T) {
t.Parallel()
client, workspace, agentToken := setupWorkspaceForAgent(t)