feat: Add high availability for multiple replicas (#4555)

* feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* fixup! feat: HA tailnet coordinator

* remove printlns

* close all connections on coordinator

* implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* fixup! implement high availability feature

* Add replicas

* Add DERP meshing to arbitrary addresses

* Move packages to highavailability folder

* Move coordinator to high availability package

* Add flags for HA

* Rename to replicasync

* Denest packages for replicas

* Add test for multiple replicas

* Fix coordination test

* Add HA to the helm chart

* Rename function pointer

* Add warnings for HA

* Add the ability to block endpoints

* Add flag to disable P2P connections

* Wow, I made the tests pass

* Add replicas endpoint

* Ensure close kills replica

* Update sql

* Add database latency to high availability

* Pipe TLS to DERP mesh

* Fix DERP mesh with TLS

* Add tests for TLS

* Fix replica sync TLS

* Fix RootCA for replica meshing

* Remove ID from replicasync

* Fix getting certificates for meshing

* Remove excessive locking

* Fix linting

* Store mesh key in the database

* Fix replica key for tests

* Fix types gen

* Fix unlocking unlocked

* Fix race in tests

* Update enterprise/derpmesh/derpmesh.go

Co-authored-by: Colin Adler <colin1adler@gmail.com>

* Rename to syncReplicas

* Reuse http client

* Delete old replicas on a CRON

* Fix race condition in connection tests

* Fix linting

* Fix nil type

* Move pubsub to in-memory for twenty tests

* Add comment for configuration tweaking

* Fix leak with transport

* Fix close leak in derpmesh

* Fix race when creating server

* Remove handler update

* Skip test on Windows

* Fix DERP mesh test

* Wrap HTTP handler replacement in mutex

* Fix error message for relay

* Fix API handler for normal tests

* Fix speedtest

* Fix replica resend

* Fix derpmesh send

* Ping async

* Increase wait time of template version job

* Fix race when closing replica sync

* Add name to client

* Log the derpmap being used

* Don't connect if DERP is empty

* Improve agent coordinator logging

* Fix lock in coordinator

* Fix relay addr

* Fix race when updating durations

* Fix client publish race

* Run pubsub loop in a queue

* Store agent nodes in order

* Fix coordinator locking

* Check for closed pipe

Co-authored-by: Colin Adler <colin1adler@gmail.com>
Author: Kyle Carberry (committed by GitHub)
Date: 2022-10-17 08:43:30 -05:00
Commit: 2ba4a62a0d (parent: dc3519e973)
76 changed files with 3437 additions and 404 deletions

@@ -7,6 +7,7 @@ import (
"net"
"net/netip"
"sync"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
@@ -14,10 +15,30 @@ import (
"tailscale.com/types/key"
)
// Coordinator exchanges nodes with agents to establish connections.
// ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐
// │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│
// └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘
// Coordinators have different guarantees for HA support.
type Coordinator interface {
// Node returns an in-memory node by ID.
Node(id uuid.UUID) *Node
// ServeClient accepts a WebSocket connection that wants to connect to an agent
// with the specified ID.
ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error
// ServeAgent accepts a WebSocket connection to an agent that listens to
// incoming connections and publishes node updates.
ServeAgent(conn net.Conn, id uuid.UUID) error
// Close closes the coordinator.
Close() error
}
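Taken together, these methods can be exercised fully in-process. A minimal sketch, assuming the github.com/coder/coder/tailnet import path, net.Pipe connections, hypothetical IDs, and the ServeCoordinator helper shown further down; error handling is elided:

package main

import (
	"net"

	"github.com/google/uuid"

	"github.com/coder/coder/tailnet"
)

func main() {
	coord := tailnet.NewCoordinator()
	defer coord.Close()

	agentID, clientID := uuid.New(), uuid.New()

	// The coordinator serves one end of each pipe.
	agentServe, agentSide := net.Pipe()
	go func() { _ = coord.ServeAgent(agentServe, agentID) }()
	clientServe, clientSide := net.Pipe()
	go func() { _ = coord.ServeClient(clientServe, clientID, agentID) }()

	// ServeCoordinator pumps node updates over the opposite pipe ends.
	sendAgentNode, _ := tailnet.ServeCoordinator(agentSide, func(nodes []*tailnet.Node) error {
		return nil // nodes of clients trying to reach this agent
	})
	sendClientNode, _ := tailnet.ServeCoordinator(clientSide, func(nodes []*tailnet.Node) error {
		return nil // the agent's node, once it is published
	})
	sendAgentNode(&tailnet.Node{})
	sendClientNode(&tailnet.Node{})
}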
// Node represents a node in the network.
type Node struct {
// ID is used to identify the connection.
ID tailcfg.NodeID `json:"id"`
// AsOf is the time the node was created.
AsOf time.Time `json:"as_of"`
// Key is the Wireguard public key of the node.
Key key.NodePublic `json:"key"`
// DiscoKey is used for discovery messages over DERP to establish peer-to-peer connections.
@@ -75,48 +96,59 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func
}, errChan
}
// NewCoordinator constructs a new in-memory connection coordinator.
func NewCoordinator() *Coordinator {
return &Coordinator{
// NewCoordinator constructs a new in-memory connection coordinator. This
// coordinator is incompatible with multiple Coder replicas as all node data is
// in-memory.
func NewCoordinator() Coordinator {
return &coordinator{
closed: false,
nodes: map[uuid.UUID]*Node{},
agentSockets: map[uuid.UUID]net.Conn{},
agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{},
}
}
// Coordinator exchanges nodes with agents to establish connections.
// coordinator exchanges nodes with agents to establish connections entirely in-memory.
// The Enterprise implementation provides this for high-availability.
// ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐
// │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│
// └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘
// This coordinator is incompatible with multiple Coder
// replicas as all node data is in-memory.
type Coordinator struct {
mutex sync.Mutex
type coordinator struct {
mutex sync.Mutex
closed bool
// Maps agent and connection IDs to a node.
// nodes maps agent and connection IDs to their respective node.
nodes map[uuid.UUID]*Node
// Maps agent ID to an open socket.
// agentSockets maps agent IDs to their open websocket.
agentSockets map[uuid.UUID]net.Conn
// Maps agent ID to connection ID for sending
// new node data as it comes in!
// agentToConnectionSockets maps agent IDs to connection IDs of conns that
// are subscribed to updates for that agent.
agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn
}
// Node returns an in-memory node by ID.
// If the node does not exist, nil is returned.
func (c *coordinator) Node(id uuid.UUID) *Node {
c.mutex.Lock()
defer c.mutex.Unlock()
return c.nodes[id]
}
// ServeClient accepts a WebSocket connection that wants to
// connect to an agent with the specified ID.
func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error {
// ServeClient accepts a WebSocket connection that wants to connect to an agent
// with the specified ID.
func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error {
c.mutex.Lock()
if c.closed {
c.mutex.Unlock()
return xerrors.New("coordinator is closed")
}
// When a new connection is requested, we update it with the latest
// node of the agent. This allows the connection to establish.
node, ok := c.nodes[agent]
c.mutex.Unlock()
if ok {
data, err := json.Marshal([]*Node{node})
if err != nil {
@@ -129,6 +161,7 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID)
return xerrors.Errorf("write nodes: %w", err)
}
}
c.mutex.Lock()
connectionSockets, ok := c.agentToConnectionSockets[agent]
if !ok {
connectionSockets = map[uuid.UUID]net.Conn{}
@@ -156,47 +189,62 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID)
decoder := json.NewDecoder(conn)
for {
err := c.handleNextClientMessage(id, agent, decoder)
if err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return xerrors.Errorf("handle next client message: %w", err)
}
}
}
func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error {
var node Node
err := decoder.Decode(&node)
if err != nil {
return xerrors.Errorf("read json: %w", err)
}
c.mutex.Lock()
// Update the node of this client in our in-memory map. If an agent entirely
// shuts down and reconnects, it needs to be aware of all clients attempting
// to establish connections.
c.nodes[id] = &node
agentSocket, ok := c.agentSockets[agent]
if !ok {
c.mutex.Unlock()
return nil
}
c.mutex.Unlock()
// Write the new node from this client to the actively connected agent.
data, err := json.Marshal([]*Node{&node})
if err != nil {
return xerrors.Errorf("marshal nodes: %w", err)
}
_, err = agentSocket.Write(data)
if err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return xerrors.Errorf("write json: %w", err)
}
return nil
}
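Both handlers pin down the wire framing: each peer writes one JSON-encoded Node object per update, and every delivery back to a peer is a JSON array of nodes. A sketch of a raw client speaking that protocol directly, assuming encoding/json, net, and xerrors are imported; publishAndAwait is a hypothetical helper, not part of this package:

// publishAndAwait writes a single JSON-encoded Node, then reads one batch of
// updates, which always arrives as a JSON array.
func publishAndAwait(conn net.Conn, self *tailnet.Node) ([]*tailnet.Node, error) {
	if err := json.NewEncoder(conn).Encode(self); err != nil {
		return nil, xerrors.Errorf("write node: %w", err)
	}
	var nodes []*tailnet.Node
	if err := json.NewDecoder(conn).Decode(&nodes); err != nil {
		return nil, xerrors.Errorf("read nodes: %w", err)
	}
	return nodes, nil
}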
// ServeAgent accepts a WebSocket connection to an agent that
// listens to incoming connections and publishes node updates.
func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
c.mutex.Lock()
if c.closed {
c.mutex.Unlock()
return xerrors.New("coordinator is closed")
}
sockets, ok := c.agentToConnectionSockets[id]
if ok {
// Publish all nodes that want to connect to the
@@ -209,16 +257,16 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
}
nodes = append(nodes, node)
}
c.mutex.Unlock()
data, err := json.Marshal(nodes)
if err != nil {
return xerrors.Errorf("marshal json: %w", err)
}
_, err = conn.Write(data)
if err != nil {
return xerrors.Errorf("write nodes: %w", err)
}
c.mutex.Lock()
}
// If an old agent socket is connected, we close it
@@ -239,36 +287,84 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error {
decoder := json.NewDecoder(conn)
for {
err := c.handleNextAgentMessage(id, decoder)
if err != nil {
if errors.Is(err, io.EOF) {
return nil
}
return xerrors.Errorf("handle next agent message: %w", err)
}
}
}
func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error {
var node Node
err := decoder.Decode(&node)
if err != nil {
return xerrors.Errorf("read json: %w", err)
}
c.mutex.Lock()
c.nodes[id] = &node
connectionSockets, ok := c.agentToConnectionSockets[id]
if !ok {
c.mutex.Unlock()
return nil
}
data, err := json.Marshal([]*Node{&node})
if err != nil {
return xerrors.Errorf("marshal nodes: %w", err)
}
// Publish the new node to every listening socket.
var wg sync.WaitGroup
wg.Add(len(connectionSockets))
for _, connectionSocket := range connectionSockets {
connectionSocket := connectionSocket
go func() {
_ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second))
_, _ = connectionSocket.Write(data)
wg.Done()
}()
}
c.mutex.Unlock()
wg.Wait()
return nil
}
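Every write in this fan-out is bounded by a deadline, so one stalled subscriber cannot delay updates to the rest, and write errors are deliberately discarded. The same pattern in isolation, as a generic sketch; broadcast is a hypothetical standalone helper:

// broadcast writes data to every conn concurrently, bounding each write with
// a deadline so a single slow peer cannot block the others.
func broadcast(conns []net.Conn, data []byte, timeout time.Duration) {
	var wg sync.WaitGroup
	wg.Add(len(conns))
	for _, conn := range conns {
		conn := conn // capture the loop variable for the goroutine
		go func() {
			defer wg.Done()
			_ = conn.SetWriteDeadline(time.Now().Add(timeout))
			_, _ = conn.Write(data)
		}()
	}
	wg.Wait()
}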
// Close closes all of the open connections in the coordinator and stops the
// coordinator from accepting new connections.
func (c *coordinator) Close() error {
c.mutex.Lock()
if c.closed {
c.mutex.Unlock()
return nil
}
c.closed = true
wg := sync.WaitGroup{}
wg.Add(len(c.agentSockets))
for _, socket := range c.agentSockets {
socket := socket
go func() {
_ = socket.Close()
wg.Done()
}()
}
for _, connMap := range c.agentToConnectionSockets {
wg.Add(len(connMap))
for _, socket := range connMap {
socket := socket
go func() {
_ = socket.Close()
wg.Done()
}()
}
}
c.mutex.Unlock()
wg.Wait()
return nil
}
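Once closed is set, subsequent Serve calls fail fast, and closing every stored socket forces the blocked decoder loops in ServeAgent and ServeClient to return. A minimal sketch of the fail-fast behavior, assuming the same imports as the sketch above:

coord := tailnet.NewCoordinator()
_ = coord.Close()

server, _ := net.Pipe()
err := coord.ServeAgent(server, uuid.New())
// err is non-nil: "coordinator is closed"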