diff --git a/README.md b/README.md index 71af2b0f..9343e101 100644 --- a/README.md +++ b/README.md @@ -553,16 +553,17 @@ individual endpoints with configurable descriptions and thresholds. Alerts are configured at the endpoint level like so: -| Parameter | Description | Default | -|:-----------------------------|:-------------------------------------------------------------------------------|:--------------| -| `alerts` | List of all alerts for a given endpoint. | `[]` | -| `alerts[].type` | Type of alert.
See table below for all valid types. | Required `""` | -| `alerts[].enabled` | Whether to enable the alert. | `true` | -| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` | -| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` | -| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` | -| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` | -| `alerts[].provider-override` | Alerting provider configuration override for the given alert type | `{}` | +| Parameter | Description | Default | +|:-------------------------------------|:-------------------------------------------------------------------------------|:--------------| +| `alerts` | List of all alerts for a given endpoint. | `[]` | +| `alerts[].type` | Type of alert.
See table below for all valid types. | Required `""` | +| `alerts[].enabled` | Whether to enable the alert. | `true` | +| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` | +| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` | +| `alerts[].minimum-reminder-interval` | Minimum interval between reminders sent while the alert remains triggered (e.g. `30m`). Reminders are disabled when unset or `0`. | `0` | +| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` | +| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` | +| `alerts[].provider-override` | Alerting provider configuration override for the given alert type | `{}` | Here's an example of what an alert configuration might look like at the endpoint level: ```yaml diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go index 52eb5f5d..ebd34c19 100644 --- a/alerting/alert/alert.go +++ b/alerting/alert/alert.go @@ -6,6 +6,7 @@ import ( "errors" "strconv" "strings" + "time" "github.com/TwiN/logr" "gopkg.in/yaml.v3" @@ -35,6 +36,9 @@ type Alert struct { // SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved SuccessThreshold int `yaml:"success-threshold"` + // MinimumReminderInterval is the interval between reminders + MinimumReminderInterval time.Duration `yaml:"minimum-reminder-interval,omitempty"` + // Description of the alert. Will be included in the alert sent. 
// // This is a pointer, because it is populated by YAML and we need to know whether it was explicitly set to a value diff --git a/alerting/provider/ilert/ilert_test.go b/alerting/provider/ilert/ilert_test.go index 3f18d60e..f5040e5e 100644 --- a/alerting/provider/ilert/ilert_test.go +++ b/alerting/provider/ilert/ilert_test.go @@ -174,21 +174,21 @@ func TestAlertProvider_BuildRequestBody(t *testing.T) { Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}}, Alert: alert.Alert{Description: &firstDescription, SuccessThreshold: 3, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved}, Resolved: false, - ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":3,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`, + ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":3,"MinimumReminderInterval":0,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`, }, { Name: "resolved", Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}}, Alert: alert.Alert{Description: &firstDescription, SuccessThreshold: 4, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved}, Resolved: true, - ExpectedBody: 
`{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":4,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"resolved","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":true},{"condition":"[STATUS] == 200","success":true}],"url":""}`, + ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":4,"MinimumReminderInterval":0,"Description":"description-1","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"resolved","title":"endpoint-name","details":"description-1","condition_results":[{"condition":"[CONNECTED] == true","success":true},{"condition":"[STATUS] == 200","success":true}],"url":""}`, }, { Name: "group-override", Provider: AlertProvider{DefaultConfig: Config{IntegrationKey: "some-integration-key"}, Overrides: []Override{{Group: "g", Config: Config{IntegrationKey: "different-integration-key"}}}}, Alert: alert.Alert{Description: &secondDescription, SuccessThreshold: 5, FailureThreshold: 3, ResolveKey: "123", Type: "ilert", SendOnResolved: &sendOnResolved}, Resolved: false, - ExpectedBody: `{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":5,"Description":"description-2","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-2","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`, + ExpectedBody: 
`{"alert":{"Type":"ilert","Enabled":null,"FailureThreshold":3,"SuccessThreshold":5,"MinimumReminderInterval":0,"Description":"description-2","SendOnResolved":true,"ProviderOverride":null,"ResolveKey":"123","Triggered":false},"name":"endpoint-name","group":"","status":"firing","title":"endpoint-name","details":"description-2","condition_results":[{"condition":"[CONNECTED] == true","success":false},{"condition":"[STATUS] == 200","success":false}],"url":""}`, }, } diff --git a/config/endpoint/endpoint.go b/config/endpoint/endpoint.go index fd1bc305..5153a45b 100644 --- a/config/endpoint/endpoint.go +++ b/config/endpoint/endpoint.go @@ -131,6 +131,9 @@ type Endpoint struct { // NumberOfSuccessesInARow is the number of successful evaluations in a row NumberOfSuccessesInARow int `yaml:"-"` + + // LastReminderSent is the time at which the last reminder was sent for this endpoint. + LastReminderSent time.Time `yaml:"-"` } // IsEnabled returns whether the endpoint is enabled or not diff --git a/watchdog/alerting.go b/watchdog/alerting.go index 866ed667..4e09477d 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -2,7 +2,9 @@ package watchdog import ( "errors" + "log" "os" + "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/config/endpoint" @@ -30,14 +32,24 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > ep.NumberOfFailuresInARow { continue } - if endpointAlert.Triggered { - logr.Debugf("[watchdog.handleAlertsToTrigger] Alert for endpoint with key=%s with description='%s' has already been TRIGGERED, skipping", ep.Key(), endpointAlert.GetDescription()) + // Determine if an initial alert should be sent + sendInitialAlert := !endpointAlert.Triggered + // Determine if a reminder should be sent + sendReminder := endpointAlert.Triggered && endpointAlert.MinimumReminderInterval > 0 && time.Since(ep.LastReminderSent) >= 
endpointAlert.MinimumReminderInterval + // If neither initial alert nor reminder needs to be sent, skip to the next alert + if !sendInitialAlert && !sendReminder { + logr.Debugf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", ep.Name, endpointAlert.GetDescription()) continue } alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type) if alertProvider != nil { logr.Infof("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint with key=%s with description='%s' has been TRIGGERED", endpointAlert.Type, ep.Key(), endpointAlert.GetDescription()) var err error + alertType := "reminder" + if sendInitialAlert { + alertType = "initial" + } + log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, ep.Name, endpointAlert.GetDescription()) if os.Getenv("MOCK_ALERT_PROVIDER") == "true" { if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" { err = errors.New("error") @@ -48,7 +60,11 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert if err != nil { logr.Errorf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint with key=%s: %s", ep.Key(), err.Error()) } else { - endpointAlert.Triggered = true + // Mark initial alert as triggered and update last reminder time + if sendInitialAlert { + endpointAlert.Triggered = true + } + ep.LastReminderSent = time.Now() if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil { logr.Errorf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error()) } diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go index 2dffd909..21bd7434 100644 --- a/watchdog/alerting_test.go +++ b/watchdog/alerting_test.go @@ -3,6 +3,7 @@ package watchdog import ( "os" "testing" 
+ "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting/alert" @@ -517,6 +518,48 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) { verify(t, ep, 0, 2, false, "") } +func TestHandleAlertingWithMinimumReminderInterval(t *testing.T) { + _ = os.Setenv("MOCK_ALERT_PROVIDER", "true") + defer os.Clearenv() + + cfg := &config.Config{ + Alerting: &alerting.Config{ + Custom: &custom.AlertProvider{ + DefaultConfig: custom.Config{ + URL: "https://twin.sh/health", + Method: "GET", + }, + }, + }, + } + enabled := true + ep := &endpoint.Endpoint{ + URL: "https://example.com", + Alerts: []*alert.Alert{ + { + Type: alert.TypeCustom, + Enabled: &enabled, + FailureThreshold: 2, + SuccessThreshold: 3, + SendOnResolved: &enabled, + Triggered: false, + MinimumReminderInterval: 1 * time.Second, + }, + }, + } + + verify(t, ep, 0, 0, false, "The alert shouldn't start triggered") + HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting) + verify(t, ep, 1, 0, false, "The alert shouldn't have triggered") + HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting) + verify(t, ep, 2, 0, true, "The alert should've triggered") + HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting) + verify(t, ep, 3, 0, true, "The alert should still be triggered") + HandleAlerting(ep, &endpoint.Result{Success: false}, cfg.Alerting) + verify(t, ep, 4, 0, true, "The alert should still be triggered") + HandleAlerting(ep, &endpoint.Result{Success: true}, cfg.Alerting) +} + func verify(t *testing.T, ep *endpoint.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) { if ep.NumberOfFailuresInARow != expectedNumberOfFailuresInARow { t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, ep.NumberOfFailuresInARow)