feat: Add high availability for multiple replicas (#4555)

* feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* remove printlns

* close all connections on coordinator

* implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* Add replicas

* Add DERP meshing to arbitrary addresses

* Move packages to highavailability folder

* Move coordinator to high availability package

* Add flags for HA

* Rename to replicasync

* Denest packages for replicas

* Add test for multiple replicas

* Fix coordination test

* Add HA to the helm chart

* Rename function pointer

* Add warnings for HA

* Add the ability to block endpoints

* Add flag to disable P2P connections

* Wow, I made the tests pass

* Add replicas endpoint

* Ensure close kills replica

* Update sql

* Add database latency to high availability

* Pipe TLS to DERP mesh

* Fix DERP mesh with TLS

* Add tests for TLS

* Fix replica sync TLS

* Fix RootCA for replica meshing

* Remove ID from replicasync

* Fix getting certificates for meshing

* Remove excessive locking

* Fix linting

* Store mesh key in the database

* Fix replica key for tests

* Fix types gen

* Fix unlocking unlocked

* Fix race in tests

* Update enterprise/derpmesh/derpmesh.go

Co-authored-by: Colin Adler <colin1adler@gmail.com>

* Rename to syncReplicas

* Reuse http client

* Delete old replicas on a CRON

* Fix race condition in connection tests

* Fix linting

* Fix nil type

* Move pubsub to in-memory for twenty test

* Add comment for configuration tweaking

* Fix leak with transport

* Fix close leak in derpmesh

* Fix race when creating server

* Remove handler update

* Skip test on Windows

* Fix DERP mesh test

* Wrap HTTP handler replacement in mutex

* Fix error message for relay

* Fix API handler for normal tests

* Fix speedtest

* Fix replica resend

* Fix derpmesh send

* Ping async

* Increase wait time of template version jobs

* Fix race when closing replica sync

* Add name to client

* Log the derpmap being used

* Don't connect if DERP is empty

* Improve agent coordinator logging

* Fix lock in coordinator

* Fix relay addr

* Fix race when updating durations

* Fix client publish race

* Run pubsub loop in a queue

* Store agent nodes in order

* Fix coordinator locking

* Check for closed pipe

Co-authored-by: Colin Adler <colin1adler@gmail.com>
This commit is contained in:
Kyle Carberry
2022-10-17 08:43:30 -05:00
committed by GitHub
parent dc3519e973
commit 2ba4a62a0d
76 changed files with 3437 additions and 404 deletions

View File

@ -3,6 +3,8 @@ package coderd
import (
"context"
"crypto/ed25519"
"crypto/tls"
"crypto/x509"
"net/http"
"sync"
"time"
@ -23,6 +25,10 @@ import (
"github.com/coder/coder/enterprise/audit"
"github.com/coder/coder/enterprise/audit/backends"
"github.com/coder/coder/enterprise/coderd/license"
"github.com/coder/coder/enterprise/derpmesh"
"github.com/coder/coder/enterprise/replicasync"
"github.com/coder/coder/enterprise/tailnet"
agpltailnet "github.com/coder/coder/tailnet"
)
// New constructs an Enterprise coderd API instance.
@ -47,6 +53,7 @@ func New(ctx context.Context, options *Options) (*API, error) {
Options: options,
cancelEntitlementsLoop: cancelFunc,
}
oauthConfigs := &httpmw.OAuth2Configs{
Github: options.GithubOAuth2Config,
OIDC: options.OIDCConfig,
@ -59,6 +66,10 @@ func New(ctx context.Context, options *Options) (*API, error) {
api.AGPL.APIHandler.Group(func(r chi.Router) {
r.Get("/entitlements", api.serveEntitlements)
r.Route("/replicas", func(r chi.Router) {
r.Use(apiKeyMiddleware)
r.Get("/", api.replicas)
})
r.Route("/licenses", func(r chi.Router) {
r.Use(apiKeyMiddleware)
r.Post("/", api.postLicense)
@ -117,7 +128,40 @@ func New(ctx context.Context, options *Options) (*API, error) {
})
}
err := api.updateEntitlements(ctx)
// Collect every certificate from the configured TLS chains into the pool
// that is used as RootCAs for replica sync and DERP meshing.
meshRootCA := x509.NewCertPool()
for _, tlsCertificate := range options.TLSCertificates {
	for index, certificatePart := range tlsCertificate.Certificate {
		parsed, err := x509.ParseCertificate(certificatePart)
		if err != nil {
			// ParseCertificate returns a nil certificate on failure, so the
			// error message must not read fields from it; identify the
			// failing entry by its position in the chain instead.
			return nil, xerrors.Errorf("parse certificate %d: %w", index, err)
		}
		meshRootCA.AddCert(parsed)
	}
}
// This TLS configuration spoofs access from the access URL hostname
// assuming that the certificates provided will cover that hostname.
//
// Replica sync and DERP meshing require accessing replicas via their
// internal IP addresses, and if TLS is configured we use the same
// certificates.
meshTLSConfig := &tls.Config{
MinVersion: tls.VersionTLS12,
Certificates: options.TLSCertificates,
RootCAs: meshRootCA,
ServerName: options.AccessURL.Hostname(),
}
var err error
api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, &replicasync.Options{
RelayAddress: options.DERPServerRelayAddress,
RegionID: int32(options.DERPServerRegionID),
TLSConfig: meshTLSConfig,
})
if err != nil {
return nil, xerrors.Errorf("initialize replica: %w", err)
}
api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig)
err = api.updateEntitlements(ctx)
if err != nil {
return nil, xerrors.Errorf("update entitlements: %w", err)
}
@ -129,13 +173,17 @@ func New(ctx context.Context, options *Options) (*API, error) {
// Options configures the Enterprise API. It embeds *coderd.Options and adds
// the enterprise-specific settings below.
type Options struct {
*coderd.Options
// RBACEnabled and RBAC are each passed to license.Entitlements as the
// codersdk.FeatureTemplateRBAC flag by updateEntitlements.
RBACEnabled bool
RBAC bool
// AuditLogging is passed to license.Entitlements as the
// codersdk.FeatureAuditLog flag.
AuditLogging bool
// Whether to block non-browser connections.
BrowserOnly bool
// SCIMAPIKey being non-empty sets the codersdk.FeatureSCIM flag passed to
// license.Entitlements.
SCIMAPIKey []byte
// UserWorkspaceQuota being non-zero sets the codersdk.FeatureWorkspaceQuota
// flag passed to license.Entitlements.
UserWorkspaceQuota int
// Used for high availability.
// DERPServerRelayAddress is handed to replicasync as the relay address; a
// non-empty value also sets the codersdk.FeatureHighAvailability flag.
DERPServerRelayAddress string
// DERPServerRegionID is handed to replicasync as the region ID.
DERPServerRegionID int
// NOTE(review): presumably the period between entitlement refreshes; the
// consumer is not visible here — confirm.
EntitlementsUpdateInterval time.Duration
// Keys is passed to license.Entitlements.
Keys map[string]ed25519.PublicKey
}
@ -144,6 +192,11 @@ type API struct {
AGPL *coderd.API
*Options
// Detects multiple Coder replicas running at the same time.
replicaManager *replicasync.Manager
// Meshes DERP connections from multiple replicas.
derpMesh *derpmesh.Mesh
cancelEntitlementsLoop func()
entitlementsMu sync.RWMutex
entitlements codersdk.Entitlements
@ -151,6 +204,8 @@ type API struct {
// Close cancels the entitlements loop, closes the replica manager and the
// DERP mesh, and finally closes the wrapped AGPL API, returning its error.
func (api *API) Close() error {
api.cancelEntitlementsLoop()
// Errors from these two Close calls are discarded; only the error from
// closing the AGPL API is reported to the caller.
_ = api.replicaManager.Close()
_ = api.derpMesh.Close()
return api.AGPL.Close()
}
@ -158,12 +213,13 @@ func (api *API) updateEntitlements(ctx context.Context) error {
api.entitlementsMu.Lock()
defer api.entitlementsMu.Unlock()
entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{
codersdk.FeatureAuditLog: api.AuditLogging,
codersdk.FeatureBrowserOnly: api.BrowserOnly,
codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0,
codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0,
codersdk.FeatureTemplateRBAC: api.RBACEnabled,
entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, len(api.replicaManager.All()), api.Keys, map[string]bool{
codersdk.FeatureAuditLog: api.AuditLogging,
codersdk.FeatureBrowserOnly: api.BrowserOnly,
codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0,
codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0,
codersdk.FeatureHighAvailability: api.DERPServerRelayAddress != "",
codersdk.FeatureTemplateRBAC: api.RBAC,
})
if err != nil {
return err
@ -209,6 +265,46 @@ func (api *API) updateEntitlements(ctx context.Context) error {
api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer)
}
if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed {
coordinator := agpltailnet.NewCoordinator()
if enabled {
haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub)
if err != nil {
api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err))
// If we try to setup the HA coordinator and it fails, nothing
// is actually changing.
changed = false
} else {
coordinator = haCoordinator
}
api.replicaManager.SetCallback(func() {
addresses := make([]string, 0)
for _, replica := range api.replicaManager.Regional() {
addresses = append(addresses, replica.RelayAddress)
}
api.derpMesh.SetAddresses(addresses, false)
_ = api.updateEntitlements(ctx)
})
} else {
api.derpMesh.SetAddresses([]string{}, false)
api.replicaManager.SetCallback(func() {
// If the amount of replicas change, so should our entitlements.
// This is to display a warning in the UI if the user is unlicensed.
_ = api.updateEntitlements(ctx)
})
}
// Recheck changed in case the HA coordinator failed to set up.
if changed {
oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator)
err := oldCoordinator.Close()
if err != nil {
api.Logger.Error(ctx, "close old tailnet coordinator", slog.Error(err))
}
}
}
api.entitlements = entitlements
return nil