From 475e6974908a7ef66d891a77fa822437f9eafbc8 Mon Sep 17 00:00:00 2001 From: yusing Date: Sun, 27 Jul 2025 21:47:29 +0800 Subject: [PATCH] fix(healthcheck): fix spamming notifications for unhealthy services - Added a new `downNotificationSent` flag to track if a service down notification has been sent. - Reset the notification state when a service comes back up. - Updated logic to ensure notifications are sent only once after reaching the configured retry threshold for consecutive failures. --- internal/watcher/health/monitor/monitor.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/internal/watcher/health/monitor/monitor.go b/internal/watcher/health/monitor/monitor.go index 0ce02a9..de47e26 100644 --- a/internal/watcher/health/monitor/monitor.go +++ b/internal/watcher/health/monitor/monitor.go @@ -35,8 +35,9 @@ type ( isZeroPort bool - notifyFunc notif.NotifyFunc - numConsecFailures atomic.Int64 + notifyFunc notif.NotifyFunc + numConsecFailures atomic.Int64 + downNotificationSent atomic.Bool task *task.Task } @@ -272,16 +273,21 @@ func (mon *monitor) checkUpdateHealth() error { if result.Healthy { mon.notifyServiceUp(&logger, result) mon.numConsecFailures.Store(0) + mon.downNotificationSent.Store(false) // Reset notification state when service comes back up } else if mon.config.Retries < 0 { - // immediate or meet the threshold + // immediate notification when retries < 0 mon.notifyServiceDown(&logger, result) + mon.downNotificationSent.Store(true) } } - // if threshold > 0, notify after threshold consecutive failures - if !result.Healthy && mon.config.Retries >= 0 && mon.numConsecFailures.Add(1) >= mon.config.Retries { - mon.numConsecFailures.Store(0) - mon.notifyServiceDown(&logger, result) + // if threshold >= 0, notify after threshold consecutive failures (but only once) + if !result.Healthy && mon.config.Retries >= 0 { + failureCount := mon.numConsecFailures.Add(1) + if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() { + mon.notifyServiceDown(&logger, result) + mon.downNotificationSent.Store(true) + } } return err