mirror of
https://github.com/coder/coder.git
synced 2025-07-23 21:32:07 +00:00
fix: fix PG Coordinator to update when heartbeats (re)start (#8178)
* fix: fix PG Coordinator to update when heartbeats (re)start Signed-off-by: Spike Curtis <spike@coder.com> * rename resetExpiryTimer(WithLock) Signed-off-by: Spike Curtis <spike@coder.com> --------- Signed-off-by: Spike Curtis <spike@coder.com>
This commit is contained in:
@@ -1137,7 +1137,13 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
|
||||
h.logger.Debug(h.ctx, "got heartbeat", slog.F("other_coordinator_id", id))
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
var oldestTime time.Time
|
||||
if _, ok := h.coordinators[id]; !ok {
|
||||
h.logger.Info(h.ctx, "heartbeats (re)started", slog.F("other_coordinator_id", id))
|
||||
// send on a separate goroutine to avoid holding lock. Triggering update can be async
|
||||
go func() {
|
||||
_ = sendCtx(h.ctx, h.update, struct{}{})
|
||||
}()
|
||||
}
|
||||
h.coordinators[id] = time.Now()
|
||||
|
||||
if h.timer == nil {
|
||||
@@ -1146,7 +1152,11 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
|
||||
h.logger.Debug(h.ctx, "set initial heartbeat timeout")
|
||||
return
|
||||
}
|
||||
h.resetExpiryTimerWithLock()
|
||||
}
|
||||
|
||||
func (h *heartbeats) resetExpiryTimerWithLock() {
|
||||
var oldestTime time.Time
|
||||
for _, t := range h.coordinators {
|
||||
if oldestTime.IsZero() || t.Before(oldestTime) {
|
||||
oldestTime = t
|
||||
@@ -1163,6 +1173,7 @@ func (h *heartbeats) recvBeat(id uuid.UUID) {
|
||||
func (h *heartbeats) checkExpiry() {
|
||||
h.logger.Debug(h.ctx, "checking heartbeat expiry")
|
||||
h.lock.Lock()
|
||||
defer h.lock.Unlock()
|
||||
now := time.Now()
|
||||
expired := false
|
||||
for id, t := range h.coordinators {
|
||||
@@ -1174,10 +1185,14 @@ func (h *heartbeats) checkExpiry() {
|
||||
h.logger.Info(h.ctx, "coordinator failed heartbeat check", slog.F("other_coordinator_id", id), slog.F("last_heartbeat", lastHB))
|
||||
}
|
||||
}
|
||||
h.lock.Unlock()
|
||||
if expired {
|
||||
_ = sendCtx(h.ctx, h.update, struct{}{})
|
||||
// send on a separate goroutine to avoid holding lock. Triggering update can be async
|
||||
go func() {
|
||||
_ = sendCtx(h.ctx, h.update, struct{}{})
|
||||
}()
|
||||
}
|
||||
// we need to reset the timer for when the next oldest coordinator will expire, if any.
|
||||
h.resetExpiryTimerWithLock()
|
||||
}
|
||||
|
||||
func (h *heartbeats) sendBeats() {
|
||||
|
@@ -211,24 +211,59 @@ func TestPGCoordinatorSingle_MissedHeartbeats(t *testing.T) {
|
||||
|
||||
// simulate a second coordinator via DB calls only --- our goal is to test broken heart-beating, so we can't use a
|
||||
// real coordinator
|
||||
fCoord := &fakeCoordinator{
|
||||
fCoord2 := &fakeCoordinator{
|
||||
ctx: ctx,
|
||||
t: t,
|
||||
store: store,
|
||||
id: uuid.New(),
|
||||
}
|
||||
// heatbeat until canceled
|
||||
ctx2, cancel2 := context.WithCancel(ctx)
|
||||
go func() {
|
||||
t := time.NewTicker(tailnet.HeartbeatPeriod)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx2.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
fCoord2.heartbeat()
|
||||
}
|
||||
}
|
||||
}()
|
||||
fCoord2.heartbeat()
|
||||
fCoord2.agentNode(agent.id, &agpl.Node{PreferredDERP: 12})
|
||||
nodes = client.recvNodes(ctx, t)
|
||||
assertHasDERPs(t, nodes, 12)
|
||||
|
||||
fCoord3 := &fakeCoordinator{
|
||||
ctx: ctx,
|
||||
t: t,
|
||||
store: store,
|
||||
id: uuid.New(),
|
||||
}
|
||||
start := time.Now()
|
||||
fCoord.heartbeat()
|
||||
fCoord.agentNode(agent.id, &agpl.Node{PreferredDERP: 12})
|
||||
fCoord3.heartbeat()
|
||||
fCoord3.agentNode(agent.id, &agpl.Node{PreferredDERP: 13})
|
||||
nodes = client.recvNodes(ctx, t)
|
||||
assertHasDERPs(t, nodes, 12)
|
||||
assertHasDERPs(t, nodes, 13)
|
||||
|
||||
// when the fake coordinator misses enough heartbeats, the real coordinator should send an update with the old
|
||||
// node for the agent.
|
||||
// when the fCoord3 misses enough heartbeats, the real coordinator should send an update with the
|
||||
// node from fCoord2 for the agent.
|
||||
nodes = client.recvNodes(ctx, t)
|
||||
assert.Greater(t, time.Since(start), tailnet.HeartbeatPeriod*tailnet.MissedHeartbeats)
|
||||
assertHasDERPs(t, nodes, 12)
|
||||
|
||||
// stop fCoord2 heartbeats, which should cause us to revert to the original agent mapping
|
||||
cancel2()
|
||||
nodes = client.recvNodes(ctx, t)
|
||||
assertHasDERPs(t, nodes, 10)
|
||||
|
||||
// send fCoord3 heartbeat, which should trigger us to consider that mapping valid again.
|
||||
fCoord3.heartbeat()
|
||||
nodes = client.recvNodes(ctx, t)
|
||||
assertHasDERPs(t, nodes, 13)
|
||||
|
||||
err = agent.close()
|
||||
require.NoError(t, err)
|
||||
_ = agent.recvErr(ctx, t)
|
||||
|
Reference in New Issue
Block a user