fix(healthcheck): stop spamming notifications for unhealthy services

- Added a new `downNotificationSent` flag to track if a service down notification has been sent.
- Reset the notification state when a service comes back up.
- Updated logic so that, once the number of consecutive failures reaches the configured retry threshold, the service-down notification is sent only once per outage instead of being repeated until the service recovers.
This commit is contained in:
yusing 2025-07-27 21:47:29 +08:00
parent 68ac4f952d
commit 475e697490

View file

@ -35,8 +35,9 @@ type (
isZeroPort bool isZeroPort bool
notifyFunc notif.NotifyFunc notifyFunc notif.NotifyFunc
numConsecFailures atomic.Int64 numConsecFailures atomic.Int64
downNotificationSent atomic.Bool
task *task.Task task *task.Task
} }
@ -272,16 +273,21 @@ func (mon *monitor) checkUpdateHealth() error {
if result.Healthy { if result.Healthy {
mon.notifyServiceUp(&logger, result) mon.notifyServiceUp(&logger, result)
mon.numConsecFailures.Store(0) mon.numConsecFailures.Store(0)
mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
} else if mon.config.Retries < 0 { } else if mon.config.Retries < 0 {
// immediate or meet the threshold // immediate notification when retries < 0
mon.notifyServiceDown(&logger, result) mon.notifyServiceDown(&logger, result)
mon.downNotificationSent.Store(true)
} }
} }
// if threshold > 0, notify after threshold consecutive failures // if threshold >= 0, notify after threshold consecutive failures (but only once)
if !result.Healthy && mon.config.Retries >= 0 && mon.numConsecFailures.Add(1) >= mon.config.Retries { if !result.Healthy && mon.config.Retries >= 0 {
mon.numConsecFailures.Store(0) failureCount := mon.numConsecFailures.Add(1)
mon.notifyServiceDown(&logger, result) if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
mon.notifyServiceDown(&logger, result)
mon.downNotificationSent.Store(true)
}
} }
return err return err