feat: Add high availability for multiple replicas (#4555)
* feat: HA tailnet coordinator
* fixup! feat: HA tailnet coordinator
* fixup! feat: HA tailnet coordinator
* remove printlns
* close all connections on coordinator
* implement high availability feature
* fixup! implement high availability feature
* fixup! implement high availability feature
* fixup! implement high availability feature
* fixup! implement high availability feature
* Add replicas
* Add DERP meshing to arbitrary addresses
* Move packages to highavailability folder
* Move coordinator to high availability package
* Add flags for HA
* Rename to replicasync
* Denest packages for replicas
* Add test for multiple replicas
* Fix coordination test
* Add HA to the helm chart
* Rename function pointer
* Add warnings for HA
* Add the ability to block endpoints
* Add flag to disable P2P connections
* Wow, I made the tests pass
* Add replicas endpoint
* Ensure close kills replica
* Update sql
* Add database latency to high availability
* Pipe TLS to DERP mesh
* Fix DERP mesh with TLS
* Add tests for TLS
* Fix replica sync TLS
* Fix RootCA for replica meshing
* Remove ID from replicasync
* Fix getting certificates for meshing
* Remove excessive locking
* Fix linting
* Store mesh key in the database
* Fix replica key for tests
* Fix types gen
* Fix unlocking unlocked
* Fix race in tests
* Update enterprise/derpmesh/derpmesh.go
  Co-authored-by: Colin Adler <colin1adler@gmail.com>
* Rename to syncReplicas
* Reuse http client
* Delete old replicas on a CRON
* Fix race condition in connection tests
* Fix linting
* Fix nil type
* Move pubsub to in-memory for twenty test
* Add comment for configuration tweaking
* Fix leak with transport
* Fix close leak in derpmesh
* Fix race when creating server
* Remove handler update
* Skip test on Windows
* Fix DERP mesh test
* Wrap HTTP handler replacement in mutex
* Fix error message for relay
* Fix API handler for normal tests
* Fix speedtest
* Fix replica resend
* Fix derpmesh send
* Ping async
* Increase wait time of template version job
* Fix race when closing replica sync
* Add name to client
* Log the derpmap being used
* Don't connect if DERP is empty
* Improve agent coordinator logging
* Fix lock in coordinator
* Fix relay addr
* Fix race when updating durations
* Fix client publish race
* Run pubsub loop in a queue
* Store agent nodes in order
* Fix coordinator locking
* Check for closed pipe

Co-authored-by: Colin Adler <colin1adler@gmail.com>
@@ -7,6 +7,7 @@ import (
 	"net"
 	"net/netip"
 	"sync"
+	"time"
 
 	"github.com/google/uuid"
 	"golang.org/x/xerrors"
@@ -14,10 +15,30 @@ import (
 	"tailscale.com/types/key"
 )
 
+// Coordinator exchanges nodes with agents to establish connections.
+// ┌──────────────────┐   ┌────────────────────┐   ┌───────────────────┐   ┌──────────────────┐
+// │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│
+// └──────────────────┘   └────────────────────┘   └───────────────────┘   └──────────────────┘
+// Coordinators have different guarantees for HA support.
+type Coordinator interface {
+	// Node returns an in-memory node by ID.
+	Node(id uuid.UUID) *Node
+	// ServeClient accepts a WebSocket connection that wants to connect to an agent
+	// with the specified ID.
+	ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error
+	// ServeAgent accepts a WebSocket connection to an agent that listens to
+	// incoming connections and publishes node updates.
+	ServeAgent(conn net.Conn, id uuid.UUID) error
+	// Close closes the coordinator.
+	Close() error
+}
+
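The interface above is the seam for the HA work: the in-memory coordinator below and the enterprise replica-aware coordinator can both satisfy it. A minimal sketch of programming against the interface rather than the concrete type (the acceptClient wrapper is illustrative, not part of this diff):

    package example

    import (
    	"log"
    	"net"

    	"github.com/coder/coder/tailnet"
    	"github.com/google/uuid"
    )

    // acceptClient hands a connection (e.g. an upgraded WebSocket wrapped as a
    // net.Conn) to whichever Coordinator implementation was configured at boot,
    // in-memory or the enterprise HA one.
    func acceptClient(coord tailnet.Coordinator, conn net.Conn, agentID uuid.UUID) {
    	if err := coord.ServeClient(conn, uuid.New(), agentID); err != nil {
    		log.Printf("serve client for agent %s: %v", agentID, err)
    	}
    }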
 // Node represents a node in the network.
 type Node struct {
 	// ID is used to identify the connection.
 	ID tailcfg.NodeID `json:"id"`
+	// AsOf is the time the node was created.
+	AsOf time.Time `json:"as_of"`
 	// Key is the Wireguard public key of the node.
 	Key key.NodePublic `json:"key"`
 	// DiscoKey is used for discovery messages over DERP to establish peer-to-peer connections.
@@ -75,48 +96,59 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func
 	}, errChan
 }
 
-// NewCoordinator constructs a new in-memory connection coordinator.
-func NewCoordinator() *Coordinator {
-	return &Coordinator{
+// NewCoordinator constructs a new in-memory connection coordinator. This
+// coordinator is incompatible with multiple Coder replicas as all node data is
+// in-memory.
+func NewCoordinator() Coordinator {
+	return &coordinator{
+		closed:                   false,
 		nodes:                    map[uuid.UUID]*Node{},
 		agentSockets:             map[uuid.UUID]net.Conn{},
 		agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{},
 	}
 }
 
-// Coordinator exchanges nodes with agents to establish connections.
+// coordinator exchanges nodes with agents to establish connections entirely in-memory.
+// The Enterprise implementation provides this for high-availability.
 // ┌──────────────────┐   ┌────────────────────┐   ┌───────────────────┐   ┌──────────────────┐
 // │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│
 // └──────────────────┘   └────────────────────┘   └───────────────────┘   └──────────────────┘
-type Coordinator struct {
-	mutex sync.Mutex
+// This coordinator is incompatible with multiple Coder
+// replicas as all node data is in-memory.
+type coordinator struct {
+	mutex  sync.Mutex
+	closed bool
 
-	// Maps agent and connection IDs to a node.
+	// nodes maps agent and connection IDs to their respective node.
 	nodes map[uuid.UUID]*Node
-	// Maps agent ID to an open socket.
+	// agentSockets maps agent IDs to their open websocket.
 	agentSockets map[uuid.UUID]net.Conn
-	// Maps agent ID to connection ID for sending
-	// new node data as it comes in!
+	// agentToConnectionSockets maps agent IDs to connection IDs of conns that
+	// are subscribed to updates for that agent.
 	agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn
 }
 
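The nested agentToConnectionSockets map is the subscription index: the outer key is the agent, the inner key a connected client. ServeClient's registration logic (visible in the next two hunks) boils down to the following illustrative helper, assumed to live in the same package and shown only to make the locking pattern explicit:

    // subscribe registers a client connection to receive node updates for an
    // agent. The nested map is only touched while c.mutex is held.
    func (c *coordinator) subscribe(agent, id uuid.UUID, conn net.Conn) {
    	c.mutex.Lock()
    	defer c.mutex.Unlock()
    	sockets, ok := c.agentToConnectionSockets[agent]
    	if !ok {
    		sockets = map[uuid.UUID]net.Conn{}
    		c.agentToConnectionSockets[agent] = sockets
    	}
    	sockets[id] = conn
    }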
 // Node returns an in-memory node by ID.
-func (c *Coordinator) Node(id uuid.UUID) *Node {
+// If the node does not exist, nil is returned.
+func (c *coordinator) Node(id uuid.UUID) *Node {
 	c.mutex.Lock()
 	defer c.mutex.Unlock()
-	node := c.nodes[id]
-	return node
+	return c.nodes[id]
 }
 
-// ServeClient accepts a WebSocket connection that wants to
-// connect to an agent with the specified ID.
-func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error {
+// ServeClient accepts a WebSocket connection that wants to connect to an agent
+// with the specified ID.
+func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error {
 	c.mutex.Lock()
+	if c.closed {
+		c.mutex.Unlock()
+		return xerrors.New("coordinator is closed")
+	}
 
 	// When a new connection is requested, we update it with the latest
 	// node of the agent. This allows the connection to establish.
 	node, ok := c.nodes[agent]
 	c.mutex.Unlock()
 	if ok {
 		data, err := json.Marshal([]*Node{node})
 		if err != nil {
@@ -129,6 +161,7 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID)
 			return xerrors.Errorf("write nodes: %w", err)
 		}
 	}
+	c.mutex.Lock()
 	connectionSockets, ok := c.agentToConnectionSockets[agent]
 	if !ok {
 		connectionSockets = map[uuid.UUID]net.Conn{}
@@ -156,47 +189,62 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID)
 
 	decoder := json.NewDecoder(conn)
 	for {
-		var node Node
-		err := decoder.Decode(&node)
-		if errors.Is(err, io.EOF) {
-			return nil
-		}
+		err := c.handleNextClientMessage(id, agent, decoder)
 		if err != nil {
-			return xerrors.Errorf("read json: %w", err)
-		}
-		c.mutex.Lock()
-		// Update the node of this client in our in-memory map.
-		// If an agent entirely shuts down and reconnects, it
-		// needs to be aware of all clients attempting to
-		// establish connections.
-		c.nodes[id] = &node
-		agentSocket, ok := c.agentSockets[agent]
-		if !ok {
-			c.mutex.Unlock()
-			continue
-		}
-		c.mutex.Unlock()
-		// Write the new node from this client to the actively
-		// connected agent.
-		data, err := json.Marshal([]*Node{&node})
-		if err != nil {
-			c.mutex.Unlock()
-			return xerrors.Errorf("marshal nodes: %w", err)
-		}
-		_, err = agentSocket.Write(data)
-		if errors.Is(err, io.EOF) {
-			return nil
-		}
-		if err != nil {
-			return xerrors.Errorf("write json: %w", err)
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
+			return xerrors.Errorf("handle next client message: %w", err)
 		}
 	}
 }
+
+func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error {
+	var node Node
+	err := decoder.Decode(&node)
+	if err != nil {
+		return xerrors.Errorf("read json: %w", err)
+	}
+
+	c.mutex.Lock()
+	// Update the node of this client in our in-memory map. If an agent entirely
+	// shuts down and reconnects, it needs to be aware of all clients attempting
+	// to establish connections.
+	c.nodes[id] = &node
+
+	agentSocket, ok := c.agentSockets[agent]
+	if !ok {
+		c.mutex.Unlock()
+		return nil
+	}
+	c.mutex.Unlock()
+
+	// Write the new node from this client to the actively connected agent.
+	data, err := json.Marshal([]*Node{&node})
+	if err != nil {
+		return xerrors.Errorf("marshal nodes: %w", err)
+	}
+
+	_, err = agentSocket.Write(data)
+	if err != nil {
+		if errors.Is(err, io.EOF) {
+			return nil
+		}
+		return xerrors.Errorf("write json: %w", err)
+	}
+
+	return nil
+}
 
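Both refactored read loops speak the same wire format: a stream of JSON values over the conn, where each peer sends single Node objects and the coordinator always replies with []*Node batches. A hedged sketch of the client end of that protocol; in the real package this role is played by ServeCoordinator, whose truncated signature appears in the hunk header above:

    package example

    import (
    	"encoding/json"
    	"net"

    	"github.com/coder/coder/tailnet"
    )

    // speakCoordinator announces our node once, then consumes batches of peer
    // nodes until the stream ends. Error handling is intentionally minimal.
    func speakCoordinator(conn net.Conn, self *tailnet.Node, onNodes func([]*tailnet.Node) error) error {
    	if err := json.NewEncoder(conn).Encode(self); err != nil {
    		return err
    	}
    	decoder := json.NewDecoder(conn)
    	for {
    		var nodes []*tailnet.Node
    		if err := decoder.Decode(&nodes); err != nil {
    			return err
    		}
    		if err := onNodes(nodes); err != nil {
    			return err
    		}
    	}
    }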
 // ServeAgent accepts a WebSocket connection to an agent that
 // listens to incoming connections and publishes node updates.
-func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
+func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
 	c.mutex.Lock()
+	if c.closed {
+		c.mutex.Unlock()
+		return xerrors.New("coordinator is closed")
+	}
+
 	sockets, ok := c.agentToConnectionSockets[id]
 	if ok {
 		// Publish all nodes that want to connect to the
@@ -209,16 +257,16 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
 			}
 			nodes = append(nodes, node)
 		}
+		c.mutex.Unlock()
 		data, err := json.Marshal(nodes)
 		if err != nil {
-			c.mutex.Unlock()
 			return xerrors.Errorf("marshal json: %w", err)
 		}
 		_, err = conn.Write(data)
 		if err != nil {
-			c.mutex.Unlock()
 			return xerrors.Errorf("write nodes: %w", err)
 		}
+		c.mutex.Lock()
 	}
 
 	// If an old agent socket is connected, we close it
@@ -239,36 +287,84 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
 
 	decoder := json.NewDecoder(conn)
 	for {
-		var node Node
-		err := decoder.Decode(&node)
-		if errors.Is(err, io.EOF) {
-			return nil
-		}
+		err := c.handleNextAgentMessage(id, decoder)
 		if err != nil {
-			return xerrors.Errorf("read json: %w", err)
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
+			return xerrors.Errorf("handle next agent message: %w", err)
 		}
-		c.mutex.Lock()
-		c.nodes[id] = &node
-		connectionSockets, ok := c.agentToConnectionSockets[id]
-		if !ok {
-			c.mutex.Unlock()
-			continue
-		}
-		data, err := json.Marshal([]*Node{&node})
-		if err != nil {
-			return xerrors.Errorf("marshal nodes: %w", err)
-		}
-		// Publish the new node to every listening socket.
-		var wg sync.WaitGroup
-		wg.Add(len(connectionSockets))
-		for _, connectionSocket := range connectionSockets {
-			connectionSocket := connectionSocket
-			go func() {
-				_, _ = connectionSocket.Write(data)
-				wg.Done()
-			}()
-		}
-		c.mutex.Unlock()
-		wg.Wait()
 	}
 }
+
+func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error {
+	var node Node
+	err := decoder.Decode(&node)
+	if err != nil {
+		return xerrors.Errorf("read json: %w", err)
+	}
+
+	c.mutex.Lock()
+	c.nodes[id] = &node
+	connectionSockets, ok := c.agentToConnectionSockets[id]
+	if !ok {
+		c.mutex.Unlock()
+		return nil
+	}
+	data, err := json.Marshal([]*Node{&node})
+	if err != nil {
+		return xerrors.Errorf("marshal nodes: %w", err)
+	}
+
+	// Publish the new node to every listening socket.
+	var wg sync.WaitGroup
+	wg.Add(len(connectionSockets))
+	for _, connectionSocket := range connectionSockets {
+		connectionSocket := connectionSocket
+		go func() {
+			_ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second))
+			_, _ = connectionSocket.Write(data)
+			wg.Done()
+		}()
+	}
+
+	c.mutex.Unlock()
+	wg.Wait()
+	return nil
+}
+
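The fan-out above gives every subscriber its own goroutine and a five-second write deadline, so one stalled connection bounds rather than blocks the agent's update loop. The same pattern in isolation, as a standalone illustrative helper (not part of the diff):

    package example

    import (
    	"net"
    	"sync"
    	"time"
    )

    // broadcast writes payload to every conn concurrently and waits for all
    // writes to finish; the per-conn deadline caps how long that wait can take.
    func broadcast(conns []net.Conn, payload []byte, timeout time.Duration) {
    	var wg sync.WaitGroup
    	wg.Add(len(conns))
    	for _, conn := range conns {
    		conn := conn
    		go func() {
    			defer wg.Done()
    			_ = conn.SetWriteDeadline(time.Now().Add(timeout))
    			_, _ = conn.Write(payload)
    		}()
    	}
    	wg.Wait()
    }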
+// Close closes all of the open connections in the coordinator and stops the
+// coordinator from accepting new connections.
+func (c *coordinator) Close() error {
+	c.mutex.Lock()
+	if c.closed {
+		return nil
+	}
+	c.closed = true
+	c.mutex.Unlock()
+
+	wg := sync.WaitGroup{}
+
+	wg.Add(len(c.agentSockets))
+	for _, socket := range c.agentSockets {
+		socket := socket
+		go func() {
+			_ = socket.Close()
+			wg.Done()
+		}()
+	}
+
+	for _, connMap := range c.agentToConnectionSockets {
+		wg.Add(len(connMap))
+		for _, socket := range connMap {
+			socket := socket
+			go func() {
+				_ = socket.Close()
+				wg.Done()
+			}()
+		}
+	}
+
+	wg.Wait()
+	return nil
+}
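Putting the pieces together, a hedged end-to-end sketch of exercising the in-memory coordinator. net.Pipe stands in for the WebSocket-backed conns, and ServeCoordinator's full signature, truncated in the hunk header above, is assumed to return a node-send function plus an error channel:

    package main

    import (
    	"fmt"
    	"net"

    	"github.com/coder/coder/tailnet"
    	"github.com/google/uuid"
    )

    func main() {
    	coord := tailnet.NewCoordinator()
    	defer coord.Close()

    	agentID := uuid.New()

    	// The coordinator serves one end of each pipe; the other end is what
    	// the agent or client would wrap their tailnet conn around.
    	agentConn, agentServe := net.Pipe()
    	go func() { _ = coord.ServeAgent(agentServe, agentID) }()

    	clientConn, clientServe := net.Pipe()
    	go func() { _ = coord.ServeClient(clientServe, uuid.New(), agentID) }()

    	// Assumed signature: ServeCoordinator(conn, updateNodes) (send, errChan).
    	sendNode, errChan := tailnet.ServeCoordinator(clientConn, func(nodes []*tailnet.Node) error {
    		fmt.Printf("client saw %d node(s)\n", len(nodes))
    		return nil
    	})
    	sendNode(&tailnet.Node{})

    	_ = agentConn // the agent side would run its own ServeCoordinator loop
    	_ = errChan
    }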