fix: Add resiliency to daemon connections (#1116)

Connections could fail when large payloads were transmitted.
This fixes an upstream bug in dRPC where the connection would
be torn down with a "context canceled" error if a message was
too large.
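
The server-side mitigation in the diff below raises the websocket read limit to match yamux's frame size. As a minimal sketch of that pattern (the sketch package and wrapDaemonConn helper are hypothetical; only the 256 KiB limit and the nhooyr.io/websocket calls come from the diff):

package sketch

import (
	"context"
	"net"

	"nhooyr.io/websocket"
)

// wrapDaemonConn illustrates the mitigation: nhooyr.io/websocket
// rejects messages above its read limit (32 KiB by default), so raise
// it to yamux's 256 KiB frame size before handing the websocket to
// the multiplexer as a net.Conn.
func wrapDaemonConn(ctx context.Context, conn *websocket.Conn) net.Conn {
	conn.SetReadLimit(256 * 1024) // align with the frame size of yamux
	return websocket.NetConn(ctx, conn, websocket.MessageBinary)
}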

This also adds retransmission of job completions and failures. If
Coder loses its connection to a provisioner daemon, the job's final
state is properly reported on the next connection.
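
The retransmission side is not in the hunks shown below (only 2 of the 8 changed files appear here), so the following is only a rough sketch of the idea; the completer interface, the completeWithRetry helper, and the one-second cadence are all illustrative assumptions, not the commit's code:

package sketch

import (
	"context"
	"time"
)

// completer abstracts the provisioner daemon's link back to coderd.
// Complete is assumed to fail while the underlying connection is down.
type completer interface {
	Complete(ctx context.Context, jobID string) error
}

// completeWithRetry keeps the job's final state and re-sends it until
// a connection accepts it, so a result is never lost to a dropped link.
func completeWithRetry(ctx context.Context, c completer, jobID string) error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		if err := c.Complete(ctx, jobID); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			// The connection may have been re-established; try again.
		}
	}
}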
Author: Kyle Carberry
Date: 2022-04-24 20:33:19 -05:00 (committed by GitHub)
Parent: be974cf280
Commit: db7ed4d019
8 changed files with 351 additions and 65 deletions

coderd/provisionerdaemons.go

@@ -17,6 +17,7 @@ import (
 	"github.com/moby/moby/pkg/namesgenerator"
 	"github.com/tabbed/pqtype"
 	"golang.org/x/xerrors"
+	protobuf "google.golang.org/protobuf/proto"
 	"nhooyr.io/websocket"
 	"storj.io/drpc/drpcmux"
 	"storj.io/drpc/drpcserver"
@@ -27,6 +28,7 @@ import (
 	"github.com/coder/coder/coderd/httpapi"
 	"github.com/coder/coder/coderd/parameter"
 	"github.com/coder/coder/provisionerd/proto"
+	"github.com/coder/coder/provisionersdk"
 	sdkproto "github.com/coder/coder/provisionersdk/proto"
 )
@@ -47,6 +49,8 @@ func (api *api) provisionerDaemonsListen(rw http.ResponseWriter, r *http.Request
 		})
 		return
 	}
+	// Align with the frame size of yamux.
+	conn.SetReadLimit(256 * 1024)
 	daemon, err := api.Database.InsertProvisionerDaemon(r.Context(), database.InsertProvisionerDaemonParams{
 		ID: uuid.New(),
@@ -82,9 +86,17 @@ func (api *api) provisionerDaemonsListen(rw http.ResponseWriter, r *http.Request
 		_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("drpc register provisioner daemon: %s", err))
 		return
 	}
-	server := drpcserver.New(mux)
+	server := drpcserver.NewWithOptions(mux, drpcserver.Options{
+		Log: func(err error) {
+			if xerrors.Is(err, io.EOF) {
+				return
+			}
+			api.Logger.Debug(r.Context(), "drpc server error", slog.Error(err))
+		},
+	})
 	err = server.Serve(r.Context(), session)
 	if err != nil {
+		api.Logger.Debug(r.Context(), "provisioner daemon disconnected", slog.Error(err))
 		_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("serve: %s", err))
 		return
 	}
@@ -253,6 +265,9 @@ func (server *provisionerdServer) AcquireJob(ctx context.Context, _ *proto.Empty
 	default:
 		return nil, failJob(fmt.Sprintf("unsupported storage method: %s", job.StorageMethod))
 	}
+	if protobuf.Size(protoJob) > provisionersdk.MaxMessageSize {
+		return nil, failJob(fmt.Sprintf("payload was too big: %d > %d", protobuf.Size(protoJob), provisionersdk.MaxMessageSize))
+	}
 	return protoJob, err
 }

coderd/provisionerdaemons_test.go

@@ -0,0 +1,48 @@
+package coderd_test
+
+import (
+	"context"
+	"crypto/rand"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/coder/coder/coderd/coderdtest"
+	"github.com/coder/coder/coderd/database"
+	"github.com/coder/coder/codersdk"
+	"github.com/coder/coder/provisionersdk"
+)
+
+func TestProvisionerDaemons(t *testing.T) {
+	t.Parallel()
+	t.Run("PayloadTooBig", func(t *testing.T) {
+		t.Parallel()
+		if runtime.GOOS == "windows" {
+			// Takes too long to allocate memory on Windows!
+			t.Skip()
+		}
+		client := coderdtest.New(t, nil)
+		user := coderdtest.CreateFirstUser(t, client)
+		coderdtest.NewProvisionerDaemon(t, client)
+		data := make([]byte, provisionersdk.MaxMessageSize)
+		rand.Read(data)
+		resp, err := client.Upload(context.Background(), codersdk.ContentTypeTar, data)
+		require.NoError(t, err)
+		t.Log(resp.Hash)
+
+		version, err := client.CreateTemplateVersion(context.Background(), user.OrganizationID, codersdk.CreateTemplateVersionRequest{
+			StorageMethod: database.ProvisionerStorageMethodFile,
+			StorageSource: resp.Hash,
+			Provisioner:   database.ProvisionerTypeEcho,
+		})
+		require.NoError(t, err)
+		require.Eventually(t, func() bool {
+			var err error
+			version, err = client.TemplateVersion(context.Background(), version.ID)
+			require.NoError(t, err)
+			return version.Job.Error != ""
+		}, 5*time.Second, 25*time.Millisecond)
+	})
+}
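
The test exercises the new guard end to end: uploading exactly provisionersdk.MaxMessageSize bytes of random data guarantees the acquired-job payload exceeds the limit, so AcquireJob fails the job and the error surfaces in version.Job.Error. Assuming a checkout at the repository root, the standard way to run just this case is:

	go test ./coderd -run 'TestProvisionerDaemons/PayloadTooBig'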