feat: Add high availability for multiple replicas (#4555)

* feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* remove printlns

* close all connections on coordinator

* implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* Add replicas

* Add DERP meshing to arbitrary addresses

* Move packages to highavailability folder

* Move coordinator to high availability package

* Add flags for HA

* Rename to replicasync

* Denest packages for replicas

* Add test for multiple replicas

* Fix coordination test

* Add HA to the helm chart

* Rename function pointer

* Add warnings for HA

* Add the ability to block endpoints

* Add flag to disable P2P connections

* Wow, I made the tests pass

* Add replicas endpoint

* Ensure close kills replica

* Update sql

* Add database latency to high availability

* Pipe TLS to DERP mesh

* Fix DERP mesh with TLS

* Add tests for TLS

* Fix replica sync TLS

* Fix RootCA for replica meshing

* Remove ID from replicasync

* Fix getting certificates for meshing

* Remove excessive locking

* Fix linting

* Store mesh key in the database

* Fix replica key for tests

* Fix types gen

* Fix unlocking unlocked

* Fix race in tests

* Update enterprise/derpmesh/derpmesh.go

Co-authored-by: Colin Adler <colin1adler@gmail.com>

* Rename to syncReplicas

* Reuse http client

* Delete old replicas on a CRON

* Fix race condition in connection tests

* Fix linting

* Fix nil type

* Move pubsub to in-memory for twenty test

* Add comment for configuration tweaking

* Fix leak with transport

* Fix close leak in derpmesh

* Fix race when creating server

* Remove handler update

* Skip test on Windows

* Fix DERP mesh test

* Wrap HTTP handler replacement in mutex

* Fix error message for relay

* Fix API handler for normal tests

* Fix speedtest

* Fix replica resend

* Fix derpmesh send

* Ping async

* Increase wait time of template version jobs

* Fix race when closing replica sync

* Add name to client

* Log the derpmap being used

* Don't connect if DERP is empty

* Improve agent coordinator logging

* Fix lock in coordinator

* Fix relay addr

* Fix race when updating durations

* Fix client publish race

* Run pubsub loop in a queue

* Store agent nodes in order

* Fix coordinator locking

* Check for closed pipe

Co-authored-by: Colin Adler <colin1adler@gmail.com>
This commit is contained in:
Kyle Carberry
2022-10-17 08:43:30 -05:00
committed by GitHub
parent dc3519e973
commit 2ba4a62a0d
76 changed files with 3437 additions and 404 deletions

View File

@ -132,10 +132,10 @@ type AgentConn struct {
CloseFunc func()
}
func (c *AgentConn) Ping() (time.Duration, error) {
func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) {
errCh := make(chan error, 1)
durCh := make(chan time.Duration, 1)
c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
go c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
if pr.Err != "" {
errCh <- xerrors.New(pr.Err)
return
@ -145,6 +145,8 @@ func (c *AgentConn) Ping() (time.Duration, error) {
select {
case err := <-errCh:
return 0, err
case <-ctx.Done():
return 0, ctx.Err()
case dur := <-durCh:
return dur, nil
}

View File

@ -15,12 +15,13 @@ const (
)
const (
FeatureUserLimit = "user_limit"
FeatureAuditLog = "audit_log"
FeatureBrowserOnly = "browser_only"
FeatureSCIM = "scim"
FeatureWorkspaceQuota = "workspace_quota"
FeatureTemplateRBAC = "template_rbac"
FeatureUserLimit = "user_limit"
FeatureAuditLog = "audit_log"
FeatureBrowserOnly = "browser_only"
FeatureSCIM = "scim"
FeatureWorkspaceQuota = "workspace_quota"
FeatureTemplateRBAC = "template_rbac"
FeatureHighAvailability = "high_availability"
)
var FeatureNames = []string{
@ -30,6 +31,7 @@ var FeatureNames = []string{
FeatureSCIM,
FeatureWorkspaceQuota,
FeatureTemplateRBAC,
FeatureHighAvailability,
}
type Feature struct {
@ -42,6 +44,7 @@ type Feature struct {
type Entitlements struct {
Features map[string]Feature `json:"features"`
Warnings []string `json:"warnings"`
Errors []string `json:"errors"`
HasLicense bool `json:"has_license"`
Experimental bool `json:"experimental"`
Trial bool `json:"trial"`

View File

@ -19,6 +19,7 @@ type DeploymentFlags struct {
DerpServerRegionCode *StringFlag `json:"derp_server_region_code" typescript:",notnull"`
DerpServerRegionName *StringFlag `json:"derp_server_region_name" typescript:",notnull"`
DerpServerSTUNAddresses *StringArrayFlag `json:"derp_server_stun_address" typescript:",notnull"`
DerpServerRelayAddress *StringFlag `json:"derp_server_relay_address" typescript:",notnull"`
DerpConfigURL *StringFlag `json:"derp_config_url" typescript:",notnull"`
DerpConfigPath *StringFlag `json:"derp_config_path" typescript:",notnull"`
PromEnabled *BoolFlag `json:"prom_enabled" typescript:",notnull"`

44
codersdk/replicas.go Normal file
View File

@ -0,0 +1,44 @@
package codersdk
import (
"context"
"encoding/json"
"net/http"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
)
// Replica is the JSON shape of a single replica entry, as decoded by
// Client.Replicas from the "/api/v2/replicas" endpoint.
type Replica struct {
// ID is the unique identifier for the replica.
ID uuid.UUID `json:"id"`
// Hostname is the hostname of the replica.
Hostname string `json:"hostname"`
// CreatedAt is when the replica was first seen.
CreatedAt time.Time `json:"created_at"`
// RelayAddress is the accessible address to relay DERP connections.
RelayAddress string `json:"relay_address"`
// RegionID is the region of the replica.
RegionID int32 `json:"region_id"`
// Error is an error message reported for the replica.
// NOTE(review): presumably empty when the replica has no error; confirm
// against the server-side handler.
Error string `json:"error"`
// DatabaseLatency is the latency in microseconds to the database.
DatabaseLatency int32 `json:"database_latency"`
}
// Replicas fetches the list of replicas.
// Replicas fetches the list of replicas.
//
// It issues a GET request to "/api/v2/replicas" and decodes the JSON response
// body into a slice of Replica. A request failure is wrapped and returned, a
// response with a status other than 200 OK is reported via readBodyAsError,
// and a malformed body yields a wrapped decode error. The returned slice is
// nil whenever the error is non-nil.
func (c *Client) Replicas(ctx context.Context) ([]Replica, error) {
	res, err := c.Request(ctx, http.MethodGet, "/api/v2/replicas", nil)
	if err != nil {
		return nil, xerrors.Errorf("execute request: %w", err)
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, readBodyAsError(res)
	}
	// Decode in a separate statement rather than inside the return expression
	// list: the Go spec does not define whether a plain variable read happens
	// before or after a function call in the same expression list, so
	// `return replicas, Decode(&replicas)` relies on compiler behavior.
	var replicas []Replica
	err = json.NewDecoder(res.Body).Decode(&replicas)
	if err != nil {
		// Do not hand back a partially decoded slice together with an error.
		return nil, xerrors.Errorf("decode replicas: %w", err)
	}
	return replicas, nil
}

View File

@ -21,7 +21,6 @@ import (
"tailscale.com/tailcfg"
"cdr.dev/slog"
"github.com/coder/coder/tailnet"
"github.com/coder/retry"
)
@ -316,7 +315,8 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err
Value: c.SessionToken,
}})
httpClient := &http.Client{
Jar: jar,
Jar: jar,
Transport: c.HTTPClient.Transport,
}
// nolint:bodyclose
conn, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{
@ -332,7 +332,17 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err
return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil
}
func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logger, agentID uuid.UUID) (*AgentConn, error) {
// DialWorkspaceAgentOptions holds the optional settings accepted by
// Client.DialWorkspaceAgent. Passing a nil pointer there is equivalent to
// passing a zero-valued struct.
// @typescript-ignore DialWorkspaceAgentOptions
type DialWorkspaceAgentOptions struct {
// Logger is handed to the tailnet connection and receives the debug logs
// emitted while dialing and serving the coordinator.
Logger slog.Logger
// BlockEndpoints forces the connection through DERP. The value is passed
// unchanged to tailnet.Options.BlockEndpoints.
// NOTE(review): presumably this works by disabling direct peer-to-peer
// endpoints; confirm against the tailnet package.
BlockEndpoints bool
}
func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) {
if options == nil {
options = &DialWorkspaceAgentOptions{}
}
res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil)
if err != nil {
return nil, err
@ -349,9 +359,10 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
ip := tailnet.IP()
conn, err := tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)},
DERPMap: connInfo.DERPMap,
Logger: logger,
Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)},
DERPMap: connInfo.DERPMap,
Logger: options.Logger,
BlockEndpoints: options.BlockEndpoints,
})
if err != nil {
return nil, xerrors.Errorf("create tailnet: %w", err)
@ -370,7 +381,8 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
Value: c.SessionToken,
}})
httpClient := &http.Client{
Jar: jar,
Jar: jar,
Transport: c.HTTPClient.Transport,
}
ctx, cancelFunc := context.WithCancel(ctx)
closed := make(chan struct{})
@ -379,7 +391,7 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
defer close(closed)
isFirst := true
for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
logger.Debug(ctx, "connecting")
options.Logger.Debug(ctx, "connecting")
// nolint:bodyclose
ws, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{
HTTPClient: httpClient,
@ -398,21 +410,21 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
if errors.Is(err, context.Canceled) {
return
}
logger.Debug(ctx, "failed to dial", slog.Error(err))
options.Logger.Debug(ctx, "failed to dial", slog.Error(err))
continue
}
sendNode, errChan := tailnet.ServeCoordinator(websocket.NetConn(ctx, ws, websocket.MessageBinary), func(node []*tailnet.Node) error {
return conn.UpdateNodes(node)
})
conn.SetNodeCallback(sendNode)
logger.Debug(ctx, "serving coordinator")
options.Logger.Debug(ctx, "serving coordinator")
err = <-errChan
if errors.Is(err, context.Canceled) {
_ = ws.Close(websocket.StatusGoingAway, "")
return
}
if err != nil {
logger.Debug(ctx, "error serving coordinator", slog.Error(err))
options.Logger.Debug(ctx, "error serving coordinator", slog.Error(err))
_ = ws.Close(websocket.StatusGoingAway, "")
continue
}