fix(healthcheck): fix spamming notifications for unhealthy services

- Added a new `downNotificationSent` flag to track whether a service-down notification has been sent.
- Reset the notification state when a service comes back up.
- Updated logic to ensure notifications are sent only once after reaching the configured retry threshold for consecutive failures.
2025-07-28 14:43:16 +02:00 · 2025-07-27 21:47:29 +08:00 · 2025-07-27 21:47:29 +08:00 · 475e697490
commit 475e697490
parent 68ac4f952d
1 changed files with 13 additions and 7 deletions
--- a/internal/watcher/health/monitor/monitor.go
+++ b/internal/watcher/health/monitor/monitor.go
@ -35,8 +35,9 @@ type (
 		isZeroPort bool
-		notifyFunc        notif.NotifyFunc
+		notifyFunc           notif.NotifyFunc
-		numConsecFailures atomic.Int64
+		numConsecFailures    atomic.Int64
 		downNotificationSent atomic.Bool
 		task *task.Task
 	}
@ -272,16 +273,21 @@ func (mon *monitor) checkUpdateHealth() error {
 		if result.Healthy {
 			mon.notifyServiceUp(&logger, result)
 			mon.numConsecFailures.Store(0)
 			mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
 		} else if mon.config.Retries < 0 {
-			// immediate or meet the threshold
+			// immediate notification when retries < 0
 			mon.notifyServiceDown(&logger, result)
 			mon.downNotificationSent.Store(true)
 		}
 	}
-	// if threshold > 0, notify after threshold consecutive failures
+	// if threshold >= 0, notify after threshold consecutive failures (but only once)
-	if !result.Healthy && mon.config.Retries >= 0 && mon.numConsecFailures.Add(1) >= mon.config.Retries {
+	if !result.Healthy && mon.config.Retries >= 0 {
-		mon.numConsecFailures.Store(0)
+		failureCount := mon.numConsecFailures.Add(1)
-		mon.notifyServiceDown(&logger, result)
+		if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
 			mon.notifyServiceDown(&logger, result)
 			mon.downNotificationSent.Store(true)
 		}
 	}
 	return err