fix: fix PG Coordinator to update when heartbeats (re)start (#8178)

* fix: fix PG Coordinator to update when heartbeats (re)start

Signed-off-by: Spike Curtis <spike@coder.com>

* rename resetExpiryTimer(WithLock)

Signed-off-by: Spike Curtis <spike@coder.com>

---------

Signed-off-by: Spike Curtis <spike@coder.com>
This commit is contained in:
Spike Curtis
2023-06-23 14:38:58 +04:00
committed by GitHub
parent ba9d038d42
commit 5d48122f12
2 changed files with 59 additions and 9 deletions

View File

@@ -1137,7 +1137,13 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
h.logger.Debug(h.ctx, "got heartbeat", slog.F("other_coordinator_id", id))
h.lock.Lock()
defer h.lock.Unlock()
var oldestTime time.Time
if _, ok := h.coordinators[id]; !ok {
h.logger.Info(h.ctx, "heartbeats (re)started", slog.F("other_coordinator_id", id))
// send on a separate goroutine to avoid holding lock. Triggering update can be async
go func() {
_ = sendCtx(h.ctx, h.update, struct{}{})
}()
}
h.coordinators[id] = time.Now()
if h.timer == nil {
@@ -1146,7 +1152,11 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
h.logger.Debug(h.ctx, "set initial heartbeat timeout")
return
}
h.resetExpiryTimerWithLock()
}
func (h *heartbeats) resetExpiryTimerWithLock() {
var oldestTime time.Time
for _, t := range h.coordinators {
if oldestTime.IsZero() || t.Before(oldestTime) {
oldestTime = t
@@ -1163,6 +1173,7 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
func (h *heartbeats) checkExpiry() {
h.logger.Debug(h.ctx, "checking heartbeat expiry")
h.lock.Lock()
defer h.lock.Unlock()
now := time.Now()
expired := false
for id, t := range h.coordinators {
@@ -1174,10 +1185,14 @@ func (h *heartbeats) checkExpiry() {
h.logger.Info(h.ctx, "coordinator failed heartbeat check", slog.F("other_coordinator_id", id), slog.F("last_heartbeat", lastHB))
}
}
h.lock.Unlock()
if expired {
_ = sendCtx(h.ctx, h.update, struct{}{})
// send on a separate goroutine to avoid holding lock. Triggering update can be async
go func() {
_ = sendCtx(h.ctx, h.update, struct{}{})
}()
}
// we need to reset the timer for when the next oldest coordinator will expire, if any.
h.resetExpiryTimerWithLock()
}
func (h *heartbeats) sendBeats() {

View File

@@ -211,24 +211,59 @@ func TestPGCoordinatorSingle_MissedHeartbeats(t *testing.T) {
// simulate a second coordinator via DB calls only --- our goal is to test broken heart-beating, so we can't use a
// real coordinator
fCoord := &fakeCoordinator{
fCoord2 := &fakeCoordinator{
ctx: ctx,
t: t,
store: store,
id: uuid.New(),
}
// heatbeat until canceled
ctx2, cancel2 := context.WithCancel(ctx)
go func() {
t := time.NewTicker(tailnet.HeartbeatPeriod)
defer t.Stop()
for {
select {
case <-ctx2.Done():
return
case <-t.C:
fCoord2.heartbeat()
}
}
}()
fCoord2.heartbeat()
fCoord2.agentNode(agent.id, &agpl.Node{PreferredDERP: 12})
nodes = client.recvNodes(ctx, t)
assertHasDERPs(t, nodes, 12)
fCoord3 := &fakeCoordinator{
ctx: ctx,
t: t,
store: store,
id: uuid.New(),
}
start := time.Now()
fCoord.heartbeat()
fCoord.agentNode(agent.id, &agpl.Node{PreferredDERP: 12})
fCoord3.heartbeat()
fCoord3.agentNode(agent.id, &agpl.Node{PreferredDERP: 13})
nodes = client.recvNodes(ctx, t)
assertHasDERPs(t, nodes, 12)
assertHasDERPs(t, nodes, 13)
// when the fake coordinator misses enough heartbeats, the real coordinator should send an update with the old
// node for the agent.
// when the fCoord3 misses enough heartbeats, the real coordinator should send an update with the
// node from fCoord2 for the agent.
nodes = client.recvNodes(ctx, t)
assert.Greater(t, time.Since(start), tailnet.HeartbeatPeriod*tailnet.MissedHeartbeats)
assertHasDERPs(t, nodes, 12)
// stop fCoord2 heartbeats, which should cause us to revert to the original agent mapping
cancel2()
nodes = client.recvNodes(ctx, t)
assertHasDERPs(t, nodes, 10)
// send fCoord3 heartbeat, which should trigger us to consider that mapping valid again.
fCoord3.heartbeat()
nodes = client.recvNodes(ctx, t)
assertHasDERPs(t, nodes, 13)
err = agent.close()
require.NoError(t, err)
_ = agent.recvErr(ctx, t)