diff --git a/main.go b/main.go index 6643d819..13788fd1 100644 --- a/main.go +++ b/main.go @@ -183,6 +183,33 @@ func initializeStorage(cfg *config.Config) { } } } + // Load persisted triggered alerts for suite endpoints + for _, suite := range cfg.Suites { + for _, ep := range suite.Endpoints { + var checksums []string + for _, alert := range ep.Alerts { + if alert.IsEnabled() { + checksums = append(checksums, alert.Checksum()) + } + } + numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep, checksums) + if numberOfTriggeredAlertsDeleted > 0 { + logr.Debugf("[main.initializeStorage] Deleted %d triggered alerts for suite endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ep.Key()) + } + for _, alert := range ep.Alerts { + exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(ep, alert) + if err != nil { + logr.Errorf("[main.initializeStorage] Failed to get triggered alert for suite endpoint with key=%s: %s", ep.Key(), err.Error()) + continue + } + if exists { + alert.Triggered, alert.ResolveKey = true, resolveKey + ep.NumberOfSuccessesInARow, ep.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold + numberOfPersistedTriggeredAlertsLoaded++ + } + } + } + } if numberOfPersistedTriggeredAlertsLoaded > 0 { logr.Infof("[main.initializeStorage] Loaded %d persisted triggered alerts", numberOfPersistedTriggeredAlertsLoaded) } diff --git a/watchdog/endpoint.go b/watchdog/endpoint.go index 0b1e7c23..9c89bafa 100644 --- a/watchdog/endpoint.go +++ b/watchdog/endpoint.go @@ -64,7 +64,6 @@ func executeEndpoint(ep *endpoint.Endpoint, cfg *config.Config, extraLabels []st } } if !cfg.Maintenance.IsUnderMaintenance() && !inEndpointMaintenanceWindow { - // TODO: Consider moving this after the monitoring lock is unlocked? I mean, how much noise can a single alerting provider cause... HandleAlerting(ep, result, cfg.Alerting) } else { logr.Debug("[watchdog.executeEndpoint] Not handling alerting because currently in the maintenance window") diff --git a/watchdog/suite.go b/watchdog/suite.go index 18df23b6..39d2f0c6 100644 --- a/watchdog/suite.go +++ b/watchdog/suite.go @@ -50,6 +50,8 @@ func executeSuite(s *suite.Suite, cfg *config.Config, extraLabels []string) { if cfg.Metrics { metrics.PublishMetricsForSuite(s, result, extraLabels) } + // Store result + UpdateSuiteStatus(s, result) // Handle alerting for suite endpoints for i, ep := range s.Endpoints { if i < len(result.EndpointResults) { @@ -72,8 +74,6 @@ func executeSuite(s *suite.Suite, cfg *config.Config, extraLabels []string) { } } logr.Infof("[watchdog.executeSuite] Completed suite=%s; success=%v; errors=%d; duration=%v; endpoints_executed=%d/%d", s.Name, result.Success, len(result.Errors), result.Duration, len(result.EndpointResults), len(s.Endpoints)) - // Store result in database - UpdateSuiteStatus(s, result) } // UpdateSuiteStatus persists the suite result in the database