From 7fe03be73f7fbd2a58ce958b24f14072d33fd171 Mon Sep 17 00:00:00 2001 From: yusing Date: Thu, 9 Jan 2025 02:53:04 +0800 Subject: [PATCH] fix: cert renewal failure causes scheduler stuck forever --- internal/autocert/provider.go | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/internal/autocert/provider.go b/internal/autocert/provider.go index 2c45aa2..deb010e 100644 --- a/internal/autocert/provider.go +++ b/internal/autocert/provider.go @@ -7,6 +7,7 @@ import ( "os" "path" "reflect" + "runtime" "sort" "time" @@ -148,28 +149,40 @@ func (p *Provider) ShouldRenewOn() time.Time { panic("no certificate available") } -func (p *Provider) ScheduleRenewal() { +func (p *Provider) ScheduleRenewal(parent task.Parent) { if p.GetName() == ProviderLocal { return } go func() { - task := task.RootTask("cert-renew-scheduler", true) + lastErrOn := time.Time{} + renewalTime := p.ShouldRenewOn() + timer := time.NewTimer(time.Until(renewalTime)) + defer timer.Stop() + + task := parent.Subtask("cert-renew-scheduler") defer task.Finish(nil) for { - renewalTime := p.ShouldRenewOn() - timer := time.NewTimer(time.Until(renewalTime)) - select { case <-task.Context().Done(): - timer.Stop() return case <-timer.C: + // Retry after 1 hour on failure + if time.Now().Before(lastErrOn.Add(time.Hour)) { + continue + } if err := p.renewIfNeeded(); err != nil { E.LogWarn("cert renew failed", err, &logger) - // Retry after 1 hour on failure - time.Sleep(time.Hour) + lastErrOn = time.Now() + continue } + // Reset on success + lastErrOn = time.Time{} + renewalTime = p.ShouldRenewOn() + timer.Reset(time.Until(renewalTime)) + default: + // Allow other tasks to run + runtime.Gosched() } } }()