fix(suites): Load persisted triggered alerts for suite endpoints on start (#1347)
This commit is contained in:
27
main.go
27
main.go
@@ -183,6 +183,33 @@ func initializeStorage(cfg *config.Config) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Load persisted triggered alerts for suite endpoints
|
||||
for _, suite := range cfg.Suites {
|
||||
for _, ep := range suite.Endpoints {
|
||||
var checksums []string
|
||||
for _, alert := range ep.Alerts {
|
||||
if alert.IsEnabled() {
|
||||
checksums = append(checksums, alert.Checksum())
|
||||
}
|
||||
}
|
||||
numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep, checksums)
|
||||
if numberOfTriggeredAlertsDeleted > 0 {
|
||||
logr.Debugf("[main.initializeStorage] Deleted %d triggered alerts for suite endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ep.Key())
|
||||
}
|
||||
for _, alert := range ep.Alerts {
|
||||
exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(ep, alert)
|
||||
if err != nil {
|
||||
logr.Errorf("[main.initializeStorage] Failed to get triggered alert for suite endpoint with key=%s: %s", ep.Key(), err.Error())
|
||||
continue
|
||||
}
|
||||
if exists {
|
||||
alert.Triggered, alert.ResolveKey = true, resolveKey
|
||||
ep.NumberOfSuccessesInARow, ep.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold
|
||||
numberOfPersistedTriggeredAlertsLoaded++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if numberOfPersistedTriggeredAlertsLoaded > 0 {
|
||||
logr.Infof("[main.initializeStorage] Loaded %d persisted triggered alerts", numberOfPersistedTriggeredAlertsLoaded)
|
||||
}
|
||||
|
||||
@@ -64,7 +64,6 @@ func executeEndpoint(ep *endpoint.Endpoint, cfg *config.Config, extraLabels []st
|
||||
}
|
||||
}
|
||||
if !cfg.Maintenance.IsUnderMaintenance() && !inEndpointMaintenanceWindow {
|
||||
// TODO: Consider moving this after the monitoring lock is unlocked? I mean, how much noise can a single alerting provider cause...
|
||||
HandleAlerting(ep, result, cfg.Alerting)
|
||||
} else {
|
||||
logr.Debug("[watchdog.executeEndpoint] Not handling alerting because currently in the maintenance window")
|
||||
|
||||
@@ -50,6 +50,8 @@ func executeSuite(s *suite.Suite, cfg *config.Config, extraLabels []string) {
|
||||
if cfg.Metrics {
|
||||
metrics.PublishMetricsForSuite(s, result, extraLabels)
|
||||
}
|
||||
// Store result
|
||||
UpdateSuiteStatus(s, result)
|
||||
// Handle alerting for suite endpoints
|
||||
for i, ep := range s.Endpoints {
|
||||
if i < len(result.EndpointResults) {
|
||||
@@ -72,8 +74,6 @@ func executeSuite(s *suite.Suite, cfg *config.Config, extraLabels []string) {
|
||||
}
|
||||
}
|
||||
logr.Infof("[watchdog.executeSuite] Completed suite=%s; success=%v; errors=%d; duration=%v; endpoints_executed=%d/%d", s.Name, result.Success, len(result.Errors), result.Duration, len(result.EndpointResults), len(s.Endpoints))
|
||||
// Store result in database
|
||||
UpdateSuiteStatus(s, result)
|
||||
}
|
||||
|
||||
// UpdateSuiteStatus persists the suite result in the database
|
||||
|
||||
Reference in New Issue
Block a user