mirror of
https://github.com/coder/coder.git
synced 2025-07-06 15:41:45 +00:00
feat: Add high availability for multiple replicas (#4555)
* feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * fixup! feat: HA tailnet coordinator * remove printlns * close all connections on coordinator * impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * fixup! impelement high availability feature * Add replicas * Add DERP meshing to arbitrary addresses * Move packages to highavailability folder * Move coordinator to high availability package * Add flags for HA * Rename to replicasync * Denest packages for replicas * Add test for multiple replicas * Fix coordination test * Add HA to the helm chart * Rename function pointer * Add warnings for HA * Add the ability to block endpoints * Add flag to disable P2P connections * Wow, I made the tests pass * Add replicas endpoint * Ensure close kills replica * Update sql * Add database latency to high availability * Pipe TLS to DERP mesh * Fix DERP mesh with TLS * Add tests for TLS * Fix replica sync TLS * Fix RootCA for replica meshing * Remove ID from replicasync * Fix getting certificates for meshing * Remove excessive locking * Fix linting * Store mesh key in the database * Fix replica key for tests * Fix types gen * Fix unlocking unlocked * Fix race in tests * Update enterprise/derpmesh/derpmesh.go Co-authored-by: Colin Adler <colin1adler@gmail.com> * Rename to syncReplicas * Reuse http client * Delete old replicas on a CRON * Fix race condition in connection tests * Fix linting * Fix nil type * Move pubsub to in-memory for twenty test * Add comment for configuration tweaking * Fix leak with transport * Fix close leak in derpmesh * Fix race when creating server * Remove handler update * Skip test on Windows * Fix DERP mesh test * Wrap HTTP handler replacement in mutex * Fix error message for relay * Fix API handler for normal tests * Fix speedtest * Fix replica resend * Fix derpmesh send * Ping async * Increase wait time of template version jobd * Fix race when closing replica sync * Add name to client * Log the derpmap being used * Don't connect if DERP is empty * Improve agent coordinator logging * Fix lock in coordinator * Fix relay addr * Fix race when updating durations * Fix client publish race * Run pubsub loop in a queue * Store agent nodes in order * Fix coordinator locking * Check for closed pipe Co-authored-by: Colin Adler <colin1adler@gmail.com>
This commit is contained in:
@ -3,6 +3,8 @@ package coderd
|
||||
import (
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
@ -23,6 +25,10 @@ import (
|
||||
"github.com/coder/coder/enterprise/audit"
|
||||
"github.com/coder/coder/enterprise/audit/backends"
|
||||
"github.com/coder/coder/enterprise/coderd/license"
|
||||
"github.com/coder/coder/enterprise/derpmesh"
|
||||
"github.com/coder/coder/enterprise/replicasync"
|
||||
"github.com/coder/coder/enterprise/tailnet"
|
||||
agpltailnet "github.com/coder/coder/tailnet"
|
||||
)
|
||||
|
||||
// New constructs an Enterprise coderd API instance.
|
||||
@ -47,6 +53,7 @@ func New(ctx context.Context, options *Options) (*API, error) {
|
||||
Options: options,
|
||||
cancelEntitlementsLoop: cancelFunc,
|
||||
}
|
||||
|
||||
oauthConfigs := &httpmw.OAuth2Configs{
|
||||
Github: options.GithubOAuth2Config,
|
||||
OIDC: options.OIDCConfig,
|
||||
@ -59,6 +66,10 @@ func New(ctx context.Context, options *Options) (*API, error) {
|
||||
|
||||
api.AGPL.APIHandler.Group(func(r chi.Router) {
|
||||
r.Get("/entitlements", api.serveEntitlements)
|
||||
r.Route("/replicas", func(r chi.Router) {
|
||||
r.Use(apiKeyMiddleware)
|
||||
r.Get("/", api.replicas)
|
||||
})
|
||||
r.Route("/licenses", func(r chi.Router) {
|
||||
r.Use(apiKeyMiddleware)
|
||||
r.Post("/", api.postLicense)
|
||||
@ -117,7 +128,40 @@ func New(ctx context.Context, options *Options) (*API, error) {
|
||||
})
|
||||
}
|
||||
|
||||
err := api.updateEntitlements(ctx)
|
||||
meshRootCA := x509.NewCertPool()
|
||||
for _, certificate := range options.TLSCertificates {
|
||||
for _, certificatePart := range certificate.Certificate {
|
||||
certificate, err := x509.ParseCertificate(certificatePart)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err)
|
||||
}
|
||||
meshRootCA.AddCert(certificate)
|
||||
}
|
||||
}
|
||||
// This TLS configuration spoofs access from the access URL hostname
|
||||
// assuming that the certificates provided will cover that hostname.
|
||||
//
|
||||
// Replica sync and DERP meshing require accessing replicas via their
|
||||
// internal IP addresses, and if TLS is configured we use the same
|
||||
// certificates.
|
||||
meshTLSConfig := &tls.Config{
|
||||
MinVersion: tls.VersionTLS12,
|
||||
Certificates: options.TLSCertificates,
|
||||
RootCAs: meshRootCA,
|
||||
ServerName: options.AccessURL.Hostname(),
|
||||
}
|
||||
var err error
|
||||
api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, &replicasync.Options{
|
||||
RelayAddress: options.DERPServerRelayAddress,
|
||||
RegionID: int32(options.DERPServerRegionID),
|
||||
TLSConfig: meshTLSConfig,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("initialize replica: %w", err)
|
||||
}
|
||||
api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig)
|
||||
|
||||
err = api.updateEntitlements(ctx)
|
||||
if err != nil {
|
||||
return nil, xerrors.Errorf("update entitlements: %w", err)
|
||||
}
|
||||
@ -129,13 +173,17 @@ func New(ctx context.Context, options *Options) (*API, error) {
|
||||
type Options struct {
|
||||
*coderd.Options
|
||||
|
||||
RBACEnabled bool
|
||||
RBAC bool
|
||||
AuditLogging bool
|
||||
// Whether to block non-browser connections.
|
||||
BrowserOnly bool
|
||||
SCIMAPIKey []byte
|
||||
UserWorkspaceQuota int
|
||||
|
||||
// Used for high availability.
|
||||
DERPServerRelayAddress string
|
||||
DERPServerRegionID int
|
||||
|
||||
EntitlementsUpdateInterval time.Duration
|
||||
Keys map[string]ed25519.PublicKey
|
||||
}
|
||||
@ -144,6 +192,11 @@ type API struct {
|
||||
AGPL *coderd.API
|
||||
*Options
|
||||
|
||||
// Detects multiple Coder replicas running at the same time.
|
||||
replicaManager *replicasync.Manager
|
||||
// Meshes DERP connections from multiple replicas.
|
||||
derpMesh *derpmesh.Mesh
|
||||
|
||||
cancelEntitlementsLoop func()
|
||||
entitlementsMu sync.RWMutex
|
||||
entitlements codersdk.Entitlements
|
||||
@ -151,6 +204,8 @@ type API struct {
|
||||
|
||||
func (api *API) Close() error {
|
||||
api.cancelEntitlementsLoop()
|
||||
_ = api.replicaManager.Close()
|
||||
_ = api.derpMesh.Close()
|
||||
return api.AGPL.Close()
|
||||
}
|
||||
|
||||
@ -158,12 +213,13 @@ func (api *API) updateEntitlements(ctx context.Context) error {
|
||||
api.entitlementsMu.Lock()
|
||||
defer api.entitlementsMu.Unlock()
|
||||
|
||||
entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{
|
||||
codersdk.FeatureAuditLog: api.AuditLogging,
|
||||
codersdk.FeatureBrowserOnly: api.BrowserOnly,
|
||||
codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0,
|
||||
codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0,
|
||||
codersdk.FeatureTemplateRBAC: api.RBACEnabled,
|
||||
entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, len(api.replicaManager.All()), api.Keys, map[string]bool{
|
||||
codersdk.FeatureAuditLog: api.AuditLogging,
|
||||
codersdk.FeatureBrowserOnly: api.BrowserOnly,
|
||||
codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0,
|
||||
codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0,
|
||||
codersdk.FeatureHighAvailability: api.DERPServerRelayAddress != "",
|
||||
codersdk.FeatureTemplateRBAC: api.RBAC,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
@ -209,6 +265,46 @@ func (api *API) updateEntitlements(ctx context.Context) error {
|
||||
api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer)
|
||||
}
|
||||
|
||||
if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed {
|
||||
coordinator := agpltailnet.NewCoordinator()
|
||||
if enabled {
|
||||
haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub)
|
||||
if err != nil {
|
||||
api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err))
|
||||
// If we try to setup the HA coordinator and it fails, nothing
|
||||
// is actually changing.
|
||||
changed = false
|
||||
} else {
|
||||
coordinator = haCoordinator
|
||||
}
|
||||
|
||||
api.replicaManager.SetCallback(func() {
|
||||
addresses := make([]string, 0)
|
||||
for _, replica := range api.replicaManager.Regional() {
|
||||
addresses = append(addresses, replica.RelayAddress)
|
||||
}
|
||||
api.derpMesh.SetAddresses(addresses, false)
|
||||
_ = api.updateEntitlements(ctx)
|
||||
})
|
||||
} else {
|
||||
api.derpMesh.SetAddresses([]string{}, false)
|
||||
api.replicaManager.SetCallback(func() {
|
||||
// If the amount of replicas change, so should our entitlements.
|
||||
// This is to display a warning in the UI if the user is unlicensed.
|
||||
_ = api.updateEntitlements(ctx)
|
||||
})
|
||||
}
|
||||
|
||||
// Recheck changed in case the HA coordinator failed to set up.
|
||||
if changed {
|
||||
oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator)
|
||||
err := oldCoordinator.Close()
|
||||
if err != nil {
|
||||
api.Logger.Error(ctx, "close old tailnet coordinator", slog.Error(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
api.entitlements = entitlements
|
||||
|
||||
return nil
|
||||
|
Reference in New Issue
Block a user