mirror of
https://github.com/coder/coder.git
synced 2025-07-13 21:36:50 +00:00
fix: Add resiliency to daemon connections (#1116)
Connections could fail when massive payloads were transmitted. This fixes an upstream bug in dRPC where the connection would end with a context canceled if a message was too large. This adds retransmission of completion and failures too. If Coder somehow loses connection with a provisioner daemon, upon the next connection the state will be properly reported.
This commit is contained in:
@ -17,6 +17,7 @@ import (
|
||||
"github.com/moby/moby/pkg/namesgenerator"
|
||||
"github.com/tabbed/pqtype"
|
||||
"golang.org/x/xerrors"
|
||||
protobuf "google.golang.org/protobuf/proto"
|
||||
"nhooyr.io/websocket"
|
||||
"storj.io/drpc/drpcmux"
|
||||
"storj.io/drpc/drpcserver"
|
||||
@ -27,6 +28,7 @@ import (
|
||||
"github.com/coder/coder/coderd/httpapi"
|
||||
"github.com/coder/coder/coderd/parameter"
|
||||
"github.com/coder/coder/provisionerd/proto"
|
||||
"github.com/coder/coder/provisionersdk"
|
||||
sdkproto "github.com/coder/coder/provisionersdk/proto"
|
||||
)
|
||||
|
||||
@ -47,6 +49,8 @@ func (api *api) provisionerDaemonsListen(rw http.ResponseWriter, r *http.Request
|
||||
})
|
||||
return
|
||||
}
|
||||
// Align with the frame size of yamux.
|
||||
conn.SetReadLimit(256 * 1024)
|
||||
|
||||
daemon, err := api.Database.InsertProvisionerDaemon(r.Context(), database.InsertProvisionerDaemonParams{
|
||||
ID: uuid.New(),
|
||||
@ -82,9 +86,17 @@ func (api *api) provisionerDaemonsListen(rw http.ResponseWriter, r *http.Request
|
||||
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("drpc register provisioner daemon: %s", err))
|
||||
return
|
||||
}
|
||||
server := drpcserver.New(mux)
|
||||
server := drpcserver.NewWithOptions(mux, drpcserver.Options{
|
||||
Log: func(err error) {
|
||||
if xerrors.Is(err, io.EOF) {
|
||||
return
|
||||
}
|
||||
api.Logger.Debug(r.Context(), "drpc server error", slog.Error(err))
|
||||
},
|
||||
})
|
||||
err = server.Serve(r.Context(), session)
|
||||
if err != nil {
|
||||
api.Logger.Debug(r.Context(), "provisioner daemon disconnected", slog.Error(err))
|
||||
_ = conn.Close(websocket.StatusInternalError, httpapi.WebsocketCloseSprintf("serve: %s", err))
|
||||
return
|
||||
}
|
||||
@ -253,6 +265,9 @@ func (server *provisionerdServer) AcquireJob(ctx context.Context, _ *proto.Empty
|
||||
default:
|
||||
return nil, failJob(fmt.Sprintf("unsupported storage method: %s", job.StorageMethod))
|
||||
}
|
||||
if protobuf.Size(protoJob) > provisionersdk.MaxMessageSize {
|
||||
return nil, failJob(fmt.Sprintf("payload was too big: %d > %d", protobuf.Size(protoJob), provisionersdk.MaxMessageSize))
|
||||
}
|
||||
|
||||
return protoJob, err
|
||||
}
|
||||
|
48
coderd/provisionerdaemons_test.go
Normal file
48
coderd/provisionerdaemons_test.go
Normal file
@ -0,0 +1,48 @@
|
||||
package coderd_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/coder/coder/coderd/coderdtest"
|
||||
"github.com/coder/coder/coderd/database"
|
||||
"github.com/coder/coder/codersdk"
|
||||
"github.com/coder/coder/provisionersdk"
|
||||
)
|
||||
|
||||
func TestProvisionerDaemons(t *testing.T) {
|
||||
t.Parallel()
|
||||
t.Run("PayloadTooBig", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
if runtime.GOOS == "windows" {
|
||||
// Takes too long to allocate memory on Windows!
|
||||
t.Skip()
|
||||
}
|
||||
client := coderdtest.New(t, nil)
|
||||
user := coderdtest.CreateFirstUser(t, client)
|
||||
coderdtest.NewProvisionerDaemon(t, client)
|
||||
data := make([]byte, provisionersdk.MaxMessageSize)
|
||||
rand.Read(data)
|
||||
resp, err := client.Upload(context.Background(), codersdk.ContentTypeTar, data)
|
||||
require.NoError(t, err)
|
||||
t.Log(resp.Hash)
|
||||
|
||||
version, err := client.CreateTemplateVersion(context.Background(), user.OrganizationID, codersdk.CreateTemplateVersionRequest{
|
||||
StorageMethod: database.ProvisionerStorageMethodFile,
|
||||
StorageSource: resp.Hash,
|
||||
Provisioner: database.ProvisionerTypeEcho,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Eventually(t, func() bool {
|
||||
var err error
|
||||
version, err = client.TemplateVersion(context.Background(), version.ID)
|
||||
require.NoError(t, err)
|
||||
return version.Job.Error != ""
|
||||
}, 5*time.Second, 25*time.Millisecond)
|
||||
})
|
||||
}
|
Reference in New Issue
Block a user