mirror of
https://github.com/coder/coder.git
synced 2025-07-15 22:20:27 +00:00
feat: Add high availability for multiple replicas (#4555)
* feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * remove printlns * close all connections on coordinator * implement high availability feature * fixup! implement high availability feature * fixup! implement high availability feature * fixup! implement high availability feature * fixup! implement high availability feature * Add replicas * Add DERP meshing to arbitrary addresses * Move packages to highavailability folder * Move coordinator to high availability package * Add flags for HA * Rename to replicasync * Denest packages for replicas * Add test for multiple replicas * Fix coordination test * Add HA to the helm chart * Rename function pointer * Add warnings for HA * Add the ability to block endpoints * Add flag to disable P2P connections * Wow, I made the tests pass * Add replicas endpoint * Ensure close kills replica * Update sql * Add database latency to high availability * Pipe TLS to DERP mesh * Fix DERP mesh with TLS * Add tests for TLS * Fix replica sync TLS * Fix RootCA for replica meshing * Remove ID from replicasync * Fix getting certificates for meshing * Remove excessive locking * Fix linting * Store mesh key in the database * Fix replica key for tests * Fix types gen * Fix unlocking unlocked * Fix race in tests * Update enterprise/derpmesh/derpmesh.go Co-authored-by: Colin Adler <colin1adler@gmail.com> * Rename to syncReplicas * Reuse http client * Delete old replicas on a CRON * Fix race condition in connection tests * Fix linting * Fix nil type * Move pubsub to in-memory for twenty test * Add comment for configuration tweaking * Fix leak with transport * Fix close leak in derpmesh * Fix race when creating server * Remove handler update * Skip test on Windows * Fix DERP mesh test * Wrap HTTP handler replacement in mutex * Fix error message for relay * Fix API handler for normal tests * Fix speedtest * Fix replica resend * Fix derpmesh send * Ping async * Increase wait time of template 
version jobd * Fix race when closing replica sync * Add name to client * Log the derpmap being used * Don't connect if DERP is empty * Improve agent coordinator logging * Fix lock in coordinator * Fix relay addr * Fix race when updating durations * Fix client publish race * Run pubsub loop in a queue * Store agent nodes in order * Fix coordinator locking * Check for closed pipe Co-authored-by: Colin Adler <colin1adler@gmail.com>
This commit is contained in:
@ -132,10 +132,10 @@ type AgentConn struct {
|
||||
CloseFunc func()
|
||||
}
|
||||
|
||||
func (c *AgentConn) Ping() (time.Duration, error) {
|
||||
func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) {
|
||||
errCh := make(chan error, 1)
|
||||
durCh := make(chan time.Duration, 1)
|
||||
c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
|
||||
go c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) {
|
||||
if pr.Err != "" {
|
||||
errCh <- xerrors.New(pr.Err)
|
||||
return
|
||||
@ -145,6 +145,8 @@ func (c *AgentConn) Ping() (time.Duration, error) {
|
||||
select {
|
||||
case err := <-errCh:
|
||||
return 0, err
|
||||
case <-ctx.Done():
|
||||
return 0, ctx.Err()
|
||||
case dur := <-durCh:
|
||||
return dur, nil
|
||||
}
|
||||
|
@ -15,12 +15,13 @@ const (
|
||||
)
|
||||
|
||||
const (
|
||||
FeatureUserLimit = "user_limit"
|
||||
FeatureAuditLog = "audit_log"
|
||||
FeatureBrowserOnly = "browser_only"
|
||||
FeatureSCIM = "scim"
|
||||
FeatureWorkspaceQuota = "workspace_quota"
|
||||
FeatureTemplateRBAC = "template_rbac"
|
||||
FeatureUserLimit = "user_limit"
|
||||
FeatureAuditLog = "audit_log"
|
||||
FeatureBrowserOnly = "browser_only"
|
||||
FeatureSCIM = "scim"
|
||||
FeatureWorkspaceQuota = "workspace_quota"
|
||||
FeatureTemplateRBAC = "template_rbac"
|
||||
FeatureHighAvailability = "high_availability"
|
||||
)
|
||||
|
||||
var FeatureNames = []string{
|
||||
@ -30,6 +31,7 @@ var FeatureNames = []string{
|
||||
FeatureSCIM,
|
||||
FeatureWorkspaceQuota,
|
||||
FeatureTemplateRBAC,
|
||||
FeatureHighAvailability,
|
||||
}
|
||||
|
||||
type Feature struct {
|
||||
@ -42,6 +44,7 @@ type Feature struct {
|
||||
type Entitlements struct {
|
||||
Features map[string]Feature `json:"features"`
|
||||
Warnings []string `json:"warnings"`
|
||||
Errors []string `json:"errors"`
|
||||
HasLicense bool `json:"has_license"`
|
||||
Experimental bool `json:"experimental"`
|
||||
Trial bool `json:"trial"`
|
||||
|
@ -19,6 +19,7 @@ type DeploymentFlags struct {
|
||||
DerpServerRegionCode *StringFlag `json:"derp_server_region_code" typescript:",notnull"`
|
||||
DerpServerRegionName *StringFlag `json:"derp_server_region_name" typescript:",notnull"`
|
||||
DerpServerSTUNAddresses *StringArrayFlag `json:"derp_server_stun_address" typescript:",notnull"`
|
||||
DerpServerRelayAddress *StringFlag `json:"derp_server_relay_address" typescript:",notnull"`
|
||||
DerpConfigURL *StringFlag `json:"derp_config_url" typescript:",notnull"`
|
||||
DerpConfigPath *StringFlag `json:"derp_config_path" typescript:",notnull"`
|
||||
PromEnabled *BoolFlag `json:"prom_enabled" typescript:",notnull"`
|
||||
|
44
codersdk/replicas.go
Normal file
44
codersdk/replicas.go
Normal file
@ -0,0 +1,44 @@
|
||||
package codersdk
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/xerrors"
|
||||
)
|
||||
|
||||
type Replica struct {
|
||||
// ID is the unique identifier for the replica.
|
||||
ID uuid.UUID `json:"id"`
|
||||
// Hostname is the hostname of the replica.
|
||||
Hostname string `json:"hostname"`
|
||||
// CreatedAt is when the replica was first seen.
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
// RelayAddress is the accessible address to relay DERP connections.
|
||||
RelayAddress string `json:"relay_address"`
|
||||
// RegionID is the region of the replica.
|
||||
RegionID int32 `json:"region_id"`
|
||||
// Error is the error.
|
||||
Error string `json:"error"`
|
||||
// DatabaseLatency is the latency in microseconds to the database.
|
||||
DatabaseLatency int32 `json:"database_latency"`
|
||||
}
|
||||
|
||||
// Replicas fetches the list of replicas.
|
||||
func (c *Client) Replicas(ctx context.Context) ([]Replica, error) {
|
||||
res, err := c.Request(ctx, http.MethodGet, "/api/v2/replicas", nil)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("execute request: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, readBodyAsError(res)
|
||||
}
|
||||
|
||||
var replicas []Replica
|
||||
return replicas, json.NewDecoder(res.Body).Decode(&replicas)
|
||||
}
|
@ -21,7 +21,6 @@ import (
|
||||
"tailscale.com/tailcfg"
|
||||
|
||||
"cdr.dev/slog"
|
||||
|
||||
"github.com/coder/coder/tailnet"
|
||||
"github.com/coder/retry"
|
||||
)
|
||||
@ -316,7 +315,8 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err
|
||||
Value: c.SessionToken,
|
||||
}})
|
||||
httpClient := &http.Client{
|
||||
Jar: jar,
|
||||
Jar: jar,
|
||||
Transport: c.HTTPClient.Transport,
|
||||
}
|
||||
// nolint:bodyclose
|
||||
conn, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{
|
||||
@ -332,7 +332,17 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err
|
||||
return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil
|
||||
}
|
||||
|
||||
func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logger, agentID uuid.UUID) (*AgentConn, error) {
|
||||
// @typescript-ignore DialWorkspaceAgentOptions
|
||||
type DialWorkspaceAgentOptions struct {
|
||||
Logger slog.Logger
|
||||
// BlockEndpoints forced a direct connection through DERP.
|
||||
BlockEndpoints bool
|
||||
}
|
||||
|
||||
func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) {
|
||||
if options == nil {
|
||||
options = &DialWorkspaceAgentOptions{}
|
||||
}
|
||||
res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -349,9 +359,10 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
|
||||
|
||||
ip := tailnet.IP()
|
||||
conn, err := tailnet.NewConn(&tailnet.Options{
|
||||
Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)},
|
||||
DERPMap: connInfo.DERPMap,
|
||||
Logger: logger,
|
||||
Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)},
|
||||
DERPMap: connInfo.DERPMap,
|
||||
Logger: options.Logger,
|
||||
BlockEndpoints: options.BlockEndpoints,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("create tailnet: %w", err)
|
||||
@ -370,7 +381,8 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
|
||||
Value: c.SessionToken,
|
||||
}})
|
||||
httpClient := &http.Client{
|
||||
Jar: jar,
|
||||
Jar: jar,
|
||||
Transport: c.HTTPClient.Transport,
|
||||
}
|
||||
ctx, cancelFunc := context.WithCancel(ctx)
|
||||
closed := make(chan struct{})
|
||||
@ -379,7 +391,7 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
|
||||
defer close(closed)
|
||||
isFirst := true
|
||||
for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
|
||||
logger.Debug(ctx, "connecting")
|
||||
options.Logger.Debug(ctx, "connecting")
|
||||
// nolint:bodyclose
|
||||
ws, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{
|
||||
HTTPClient: httpClient,
|
||||
@ -398,21 +410,21 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return
|
||||
}
|
||||
logger.Debug(ctx, "failed to dial", slog.Error(err))
|
||||
options.Logger.Debug(ctx, "failed to dial", slog.Error(err))
|
||||
continue
|
||||
}
|
||||
sendNode, errChan := tailnet.ServeCoordinator(websocket.NetConn(ctx, ws, websocket.MessageBinary), func(node []*tailnet.Node) error {
|
||||
return conn.UpdateNodes(node)
|
||||
})
|
||||
conn.SetNodeCallback(sendNode)
|
||||
logger.Debug(ctx, "serving coordinator")
|
||||
options.Logger.Debug(ctx, "serving coordinator")
|
||||
err = <-errChan
|
||||
if errors.Is(err, context.Canceled) {
|
||||
_ = ws.Close(websocket.StatusGoingAway, "")
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
logger.Debug(ctx, "error serving coordinator", slog.Error(err))
|
||||
options.Logger.Debug(ctx, "error serving coordinator", slog.Error(err))
|
||||
_ = ws.Close(websocket.StatusGoingAway, "")
|
||||
continue
|
||||
}
|
||||
|
Reference in New Issue
Block a user