fix: fix race condition in pubsub startup (#17088)

fixes https://github.com/coder/internal/issues/525

If the context is canceled, the goroutine that is supposed to read from the `errCh` could exit prematurely, leading to a goroutine leak. Refactors this code so it cannot block.
This commit is contained in:
Spike Curtis
2025-03-25 16:45:45 +04:00
committed by GitHub
parent 4c33846f6d
commit 5f3a53f01b

View File

@ -492,7 +492,6 @@ func (p *PGPubsub) startListener(ctx context.Context, connectURL string) error {
p.connected.Set(0)
// Creates a new listener using pq.
var (
errCh = make(chan error)
dialer = logDialer{
logger: p.logger,
// pq.defaultDialer uses a zero net.Dialer as well.
@ -525,6 +524,10 @@ func (p *PGPubsub) startListener(ctx context.Context, connectURL string) error {
dc.Dialer(dialer)
}
var (
errCh = make(chan error, 1)
sentErrCh = false
)
p.pgListener = pqListenerShim{
Listener: pq.NewConnectorListener(connector, connectURL, time.Second, time.Minute, func(t pq.ListenerEventType, err error) {
switch t {
@ -541,18 +544,16 @@ func (p *PGPubsub) startListener(ctx context.Context, connectURL string) error {
p.logger.Error(ctx, "pubsub failed to connect to postgres", slog.Error(err))
}
// This callback gets events whenever the connection state changes.
// Don't send if the errChannel has already been closed.
select {
case <-errCh:
// Only send the first error.
if sentErrCh {
return
default:
errCh <- err
close(errCh)
}
errCh <- err // won't block because we are buffered.
sentErrCh = true
}),
}
select {
case err := <-errCh:
case err = <-errCh:
if err != nil {
_ = p.pgListener.Close()
return xerrors.Errorf("create pq listener: %w", err)