fix(tailnet): prevent redial after Coord graceful restart (#15586)

fixes: https://github.com/coder/internal/issues/217

> There are a couple problems:
>
> One is that we assert the RPCs succeed, but if the pipeDialer context is canceled at the end of the test, then these assertions happen after the test is officially complete, which panics and affects other tests.

This converts these to just return the error rather than assert.

> The other is that the retrier is slightly bugged: if the current retry delay is 0 AND the ctx is done, (e.g. after successfully connecting and then gracefully disconnecting), then retrier.Wait(c.ctx) is racy and could return either true or false.

Fixes the phantom redial by explicitly checking the context before dialing. Also, in the test, we assert that the controller is closed before completing the test.
This commit is contained in:
Spike Curtis
2024-11-19 11:37:11 +04:00
committed by GitHub
parent 85c3c4c025
commit 029cd5d064
2 changed files with 17 additions and 6 deletions

View File

@ -1193,10 +1193,17 @@ func (c *Controller) Run(ctx context.Context) {
// Sadly retry doesn't support quartz.Clock yet so this is not
// influenced by the configured clock.
for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(c.ctx); {
// Check the context again before dialing, since `retrier.Wait()` could return true
// if the delay is 0, even if the context was canceled. This ensures we don't redial
// after a graceful shutdown.
if c.ctx.Err() != nil {
return
}
tailnetClients, err := c.Dialer.Dial(c.ctx, c.ResumeTokenCtrl)
if err != nil {
if xerrors.Is(err, context.Canceled) {
continue
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
return
}
errF := slog.Error(err)
var sdkErr *codersdk.Error

View File

@ -14,7 +14,6 @@ import (
"github.com/google/uuid"
"github.com/hashicorp/yamux"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/mock/gomock"
"golang.org/x/xerrors"
@ -1071,6 +1070,7 @@ func TestController_Disconnects(t *testing.T) {
close(call.Resps)
_ = testutil.RequireRecvCtx(testCtx, t, peersLost)
_ = testutil.RequireRecvCtx(testCtx, t, uut.Closed())
}
func TestController_TelemetrySuccess(t *testing.T) {
@ -1210,24 +1210,28 @@ type pipeDialer struct {
func (p *pipeDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
s, c := net.Pipe()
p.t.Cleanup(func() {
_ = s.Close()
_ = c.Close()
})
go func() {
err := p.svc.ServeConnV2(p.ctx, s, p.streamID)
p.logger.Debug(p.ctx, "piped tailnet service complete", slog.Error(err))
}()
client, err := tailnet.NewDRPCClient(c, p.logger)
if !assert.NoError(p.t, err) {
if err != nil {
_ = c.Close()
return tailnet.ControlProtocolClients{}, err
}
coord, err := client.Coordinate(context.Background())
if !assert.NoError(p.t, err) {
if err != nil {
_ = c.Close()
return tailnet.ControlProtocolClients{}, err
}
derps := &tailnet.DERPFromDRPCWrapper{}
derps.Client, err = client.StreamDERPMaps(context.Background(), &proto.StreamDERPMapsRequest{})
if !assert.NoError(p.t, err) {
if err != nil {
_ = c.Close()
return tailnet.ControlProtocolClients{}, err
}