feat: use tailnet v2 API for coordination (#11638)

This one is huge, and I'm sorry.

The problem is that once I change `tailnet.Conn` to start doing v2 behavior, I kind of have to change it everywhere, including in CoderSDK (CLI), the agent, wsproxy, and ServerTailnet.

There is still a bit more cleanup to do, and I need to add code so that when we lose connection to the Coordinator, we mark all peers as LOST, but that will be in a separate PR since this is big enough!
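A quick orientation before the diff: on the client side, the old pattern of wiring node callbacks through `tailnet.ServeCoordinator` and `conn.SetNodeCallback` is replaced by constructing a coordination object and closing it on teardown. The sketch below is distilled from the updated `setupAgent` in `wsconncache_test.go` further down; the function name `coordinate` and its parameter list are illustrative only, not part of this change.

// Sketch only: the new v2 client-side wiring, assuming the caller already has a
// Coordinator, a *tailnet.Conn, and the peer IDs in hand.
package example

import (
	"context"

	"github.com/google/uuid"

	"cdr.dev/slog"
	"github.com/coder/coder/v2/tailnet"
)

func coordinate(logger slog.Logger, coord tailnet.Coordinator, conn *tailnet.Conn, clientID, agentID uuid.UUID) (stop func()) {
	ctx, cancel := context.WithCancel(context.Background())
	// NewInMemoryCoordination replaces the ServeCoordinator/SetNodeCallback
	// plumbing: it relays coordination between conn and coord until closed.
	coordination := tailnet.NewInMemoryCoordination(ctx, logger, clientID, agentID, coord, conn)
	return func() {
		_ = coordination.Close()
		cancel()
	}
}

Teardown is now just closing the coordination and cancelling its context, which is exactly what the `t.Cleanup` calls in the test below do.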
Author: Spike Curtis
Date: 2024-01-22 11:07:50 +04:00 (committed by GitHub)
Commit: f01cab9894 (parent 5a2cf7cd14)
31 changed files with 1192 additions and 1114 deletions
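The serving side of the same handshake appears in the rewritten `client.Listen` in the diff: rather than handing a `net.Conn` to `coordinator.ServeAgent`, the coordinator is published behind the DRPC Tailnet service. Below is a condensed sketch of that test scaffolding, with the same stubbed-out DERP map handling; the function name `serveTailnetV2` and the "example-agent" stream name are illustrative.

// Condensed sketch of exposing a tailnet.Coordinator over the v2 DRPC service,
// mirroring the new client.Listen in the diff below.
package example

import (
	"context"
	"net"
	"sync/atomic"
	"time"

	"github.com/google/uuid"
	"tailscale.com/tailcfg"

	"cdr.dev/slog"
	"github.com/coder/coder/v2/tailnet"
	"github.com/coder/coder/v2/tailnet/proto"

	"storj.io/drpc/drpcmux"
	"storj.io/drpc/drpcserver"
)

func serveTailnetV2(ctx context.Context, logger slog.Logger, coord tailnet.Coordinator, agentID uuid.UUID, lis net.Listener) error {
	// The service reads the coordinator through an atomic pointer so it can be
	// swapped out at runtime.
	coordPtr := atomic.Pointer[tailnet.Coordinator]{}
	coordPtr.Store(&coord)

	mux := drpcmux.New()
	err := proto.DRPCRegisterTailnet(mux, &tailnet.DRPCService{
		CoordPtr:               &coordPtr,
		Logger:                 logger,
		DerpMapUpdateFrequency: time.Hour, // DERP map updates are not exercised here
		DerpMapFn:              func() *tailcfg.DERPMap { panic("not implemented") },
	})
	if err != nil {
		return err
	}

	// Each stream carries a StreamID naming the peer and its tunnel auth.
	ctx = tailnet.WithStreamID(ctx, tailnet.StreamID{
		Name: "example-agent",
		ID:   agentID,
		Auth: tailnet.AgentTunnelAuth{},
	})
	return drpcserver.New(mux).Serve(ctx, lis)
}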


@@ -12,14 +12,19 @@ import (
"net/url"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
"go.uber.org/goleak"
"golang.org/x/xerrors"
"storj.io/drpc"
"storj.io/drpc/drpcmux"
"storj.io/drpc/drpcserver"
"tailscale.com/tailcfg"
"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"
@@ -27,7 +32,9 @@ import (
"github.com/coder/coder/v2/coderd/wsconncache"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
+ drpcsdk "github.com/coder/coder/v2/codersdk/drpc"
"github.com/coder/coder/v2/tailnet"
"github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/coder/v2/tailnet/tailnettest"
"github.com/coder/coder/v2/testutil"
)
@@ -41,7 +48,7 @@ func TestCache(t *testing.T) {
t.Run("Same", func(t *testing.T) {
t.Parallel()
cache := wsconncache.New(func(id uuid.UUID) (*codersdk.WorkspaceAgentConn, error) {
- return setupAgent(t, agentsdk.Manifest{}, 0), nil
+ return setupAgent(t, agentsdk.Manifest{}, 0)
}, 0)
defer func() {
_ = cache.Close()
@@ -54,10 +61,10 @@ func TestCache(t *testing.T) {
})
t.Run("Expire", func(t *testing.T) {
t.Parallel()
- called := atomic.NewInt32(0)
+ called := int32(0)
cache := wsconncache.New(func(id uuid.UUID) (*codersdk.WorkspaceAgentConn, error) {
- called.Add(1)
- return setupAgent(t, agentsdk.Manifest{}, 0), nil
+ atomic.AddInt32(&called, 1)
+ return setupAgent(t, agentsdk.Manifest{}, 0)
}, time.Microsecond)
defer func() {
_ = cache.Close()
@@ -70,12 +77,12 @@ func TestCache(t *testing.T) {
require.NoError(t, err)
release()
<-conn.Closed()
- require.Equal(t, int32(2), called.Load())
+ require.Equal(t, int32(2), called)
})
t.Run("NoExpireWhenLocked", func(t *testing.T) {
t.Parallel()
cache := wsconncache.New(func(id uuid.UUID) (*codersdk.WorkspaceAgentConn, error) {
- return setupAgent(t, agentsdk.Manifest{}, 0), nil
+ return setupAgent(t, agentsdk.Manifest{}, 0)
}, time.Microsecond)
defer func() {
_ = cache.Close()
@@ -108,7 +115,7 @@ func TestCache(t *testing.T) {
go server.Serve(random)
cache := wsconncache.New(func(id uuid.UUID) (*codersdk.WorkspaceAgentConn, error) {
- return setupAgent(t, agentsdk.Manifest{}, 0), nil
+ return setupAgent(t, agentsdk.Manifest{}, 0)
}, time.Microsecond)
defer func() {
_ = cache.Close()
@@ -154,7 +161,7 @@ func TestCache(t *testing.T) {
})
}
- func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Duration) *codersdk.WorkspaceAgentConn {
+ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Duration) (*codersdk.WorkspaceAgentConn, error) {
t.Helper()
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
manifest.DERPMap, _ = tailnettest.RunDERPAndSTUN(t)
@@ -184,18 +191,25 @@ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Durati
DERPForceWebSockets: manifest.DERPForceWebSockets,
Logger: slogtest.Make(t, nil).Named("tailnet").Leveled(slog.LevelDebug),
})
- require.NoError(t, err)
- clientConn, serverConn := net.Pipe()
+ // setupAgent is called by wsconncache Dialer, so we can't use require here as it will end the
+ // test, which in turn closes the wsconncache, which in turn waits for the Dialer and deadlocks.
+ if !assert.NoError(t, err) {
+ return nil, err
+ }
t.Cleanup(func() {
- _ = clientConn.Close()
- _ = serverConn.Close()
_ = conn.Close()
})
- go coordinator.ServeClient(serverConn, uuid.New(), manifest.AgentID)
- sendNode, _ := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error {
- return conn.UpdateNodes(nodes, false)
+ clientID := uuid.New()
+ testCtx, testCtxCancel := context.WithCancel(context.Background())
+ t.Cleanup(testCtxCancel)
+ coordination := tailnet.NewInMemoryCoordination(
+ testCtx, logger,
+ clientID, manifest.AgentID,
+ coordinator, conn,
+ )
+ t.Cleanup(func() {
+ _ = coordination.Close()
})
- conn.SetNodeCallback(sendNode)
agentConn := codersdk.NewWorkspaceAgentConn(conn, codersdk.WorkspaceAgentConnOptions{
AgentID: manifest.AgentID,
AgentIP: codersdk.WorkspaceAgentIP,
@@ -206,16 +220,20 @@ func setupAgent(t *testing.T, manifest agentsdk.Manifest, ptyTimeout time.Durati
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitMedium)
defer cancel()
if !agentConn.AwaitReachable(ctx) {
t.Fatal("agent not reachable")
// setupAgent is called by wsconncache Dialer, so we can't use t.Fatal here as it will end
// the test, which in turn closes the wsconncache, which in turn waits for the Dialer and
// deadlocks.
t.Error("agent not reachable")
return nil, xerrors.New("agent not reachable")
}
- return agentConn
+ return agentConn, nil
}
type client struct {
t *testing.T
agentID uuid.UUID
manifest agentsdk.Manifest
- coordinator tailnet.CoordinatorV1
+ coordinator tailnet.Coordinator
}
func (c *client) Manifest(_ context.Context) (agentsdk.Manifest, error) {
@@ -240,19 +258,53 @@ func (*client) DERPMapUpdates(_ context.Context) (<-chan agentsdk.DERPMapUpdate,
}, nil
}
- func (c *client) Listen(_ context.Context) (net.Conn, error) {
- clientConn, serverConn := net.Pipe()
+ func (c *client) Listen(_ context.Context) (drpc.Conn, error) {
+ logger := slogtest.Make(c.t, nil).Leveled(slog.LevelDebug).Named("drpc")
+ conn, lis := drpcsdk.MemTransportPipe()
closed := make(chan struct{})
c.t.Cleanup(func() {
- _ = serverConn.Close()
- _ = clientConn.Close()
+ _ = conn.Close()
+ _ = lis.Close()
<-closed
})
+ coordPtr := atomic.Pointer[tailnet.Coordinator]{}
+ coordPtr.Store(&c.coordinator)
+ mux := drpcmux.New()
+ drpcService := &tailnet.DRPCService{
+ CoordPtr: &coordPtr,
+ Logger: logger,
+ // TODO: handle DERPMap too!
+ DerpMapUpdateFrequency: time.Hour,
+ DerpMapFn: func() *tailcfg.DERPMap { panic("not implemented") },
+ }
+ err := proto.DRPCRegisterTailnet(mux, drpcService)
+ if err != nil {
+ return nil, xerrors.Errorf("register DRPC service: %w", err)
+ }
+ server := drpcserver.NewWithOptions(mux, drpcserver.Options{
+ Log: func(err error) {
+ if xerrors.Is(err, io.EOF) ||
+ xerrors.Is(err, context.Canceled) ||
+ xerrors.Is(err, context.DeadlineExceeded) {
+ return
+ }
+ logger.Debug(context.Background(), "drpc server error", slog.Error(err))
+ },
+ })
+ serveCtx, cancel := context.WithCancel(context.Background())
+ c.t.Cleanup(cancel)
+ auth := tailnet.AgentTunnelAuth{}
+ streamID := tailnet.StreamID{
+ Name: "wsconncache_test-agent",
+ ID: c.agentID,
+ Auth: auth,
+ }
+ serveCtx = tailnet.WithStreamID(serveCtx, streamID)
go func() {
- _ = c.coordinator.ServeAgent(serverConn, c.agentID, "")
+ server.Serve(serveCtx, lis)
close(closed)
}()
- return clientConn, nil
+ return conn, nil
}
func (*client) ReportStats(_ context.Context, _ slog.Logger, _ <-chan *agentsdk.Stats, _ func(time.Duration)) (io.Closer, error) {