feat(suite): Implement Suites (#1239)

* feat(suite): Implement Suites
  Fixes #1230
* Update docs
* Fix variable alignment
* Prevent always-run endpoint from running if a context placeholder fails to resolve in the URL
* Return errors when a context placeholder path fails to resolve
* Add a couple of unit tests
* Add a couple of unit tests
* fix(ui): Update group count properly
  Fixes #1233
* refactor: Pass down entire config instead of several sub-configs
* fix: Change default suite interval and timeout
* fix: Deprecate disable-monitoring-lock in favor of concurrency
* fix: Make sure there are no duplicate keys
* Refactor some code
* Update watchdog/watchdog.go
* Update web/app/src/components/StepDetailsModal.vue
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* chore: Remove useless log
* fix: Set default concurrency to 3 instead of 5
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-09-05 15:39:12 -04:00
parent 10cabb9dde
commit d668a14703
74 changed files with 7513 additions and 652 deletions
--- a/watchdog/endpoint.go
+++ b/watchdog/endpoint.go
@@ -0,0 +1,80 @@
+package watchdog
+
+import (
+	"context"
+	"time"
+
+	"github.com/TwiN/gatus/v5/config"
+	"github.com/TwiN/gatus/v5/config/endpoint"
+	"github.com/TwiN/gatus/v5/metrics"
+	"github.com/TwiN/gatus/v5/storage/store"
+	"github.com/TwiN/logr"
+)
+
+// monitorEndpoint evaluates a single endpoint immediately, then once per ep.Interval until ctx is done.
+// Just in case somebody wonders, "what about ExternalEndpoints?": alerting is checked every time an external
+// endpoint is pushed to Gatus, so they're not monitored periodically like they are for normal endpoints.
+func monitorEndpoint(ep *endpoint.Endpoint, cfg *config.Config, extraLabels []string, ctx context.Context) {
+	// Run the first evaluation right away, before the ticker is created
+	executeEndpoint(ep, cfg, extraLabels)
+	// A ticker drives the next executions; it is stopped when this goroutine returns
+	intervalTicker := time.NewTicker(ep.Interval)
+	defer intervalTicker.Stop()
+	for {
+		select {
+		case <-intervalTicker.C:
+			// Every following evaluation is triggered by a tick
+			executeEndpoint(ep, cfg, extraLabels)
+		case <-ctx.Done():
+			logr.Warnf("[watchdog.monitorEndpoint] Canceling current execution of group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
+			return
+		}
+	}
+}
+
+// executeEndpoint evaluates ep once, then publishes metrics (if enabled), persists the result and handles alerting
+func executeEndpoint(ep *endpoint.Endpoint, cfg *config.Config, extraLabels []string) {
+	// Limit concurrent monitoring; acquiring only fails if the package-level ctx is cancelled (during shutdown)
+	if err := monitoringSemaphore.Acquire(ctx, 1); err != nil {
+		logr.Debugf("[watchdog.executeEndpoint] Context cancelled, skipping execution: %s", err.Error())
+		return
+	}
+	defer monitoringSemaphore.Release(1)
+	// If there's a connectivity checker configured, check if Gatus has internet connectivity
+	if cfg.Connectivity != nil && cfg.Connectivity.Checker != nil && !cfg.Connectivity.Checker.IsConnected() {
+		logr.Infof("[watchdog.executeEndpoint] No connectivity; skipping execution")
+		return
+	}
+	logr.Debugf("[watchdog.executeEndpoint] Monitoring group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
+	result := ep.EvaluateHealth()
+	if cfg.Metrics {
+		metrics.PublishMetricsForEndpoint(ep, result, extraLabels)
+	}
+	UpdateEndpointStatus(ep, result)
+	if logr.GetThreshold() == logr.LevelDebug && !result.Success {
+		logr.Debugf("[watchdog.executeEndpoint] Monitored group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s; body=%s", ep.Group, ep.Name, ep.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond), result.Body)
+	} else {
+		logr.Infof("[watchdog.executeEndpoint] Monitored group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s", ep.Group, ep.Name, ep.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond))
+	}
+	inEndpointMaintenanceWindow := false
+	for _, maintenanceWindow := range ep.MaintenanceWindows {
+		if maintenanceWindow.IsUnderMaintenance() {
+			logr.Debug("[watchdog.executeEndpoint] Under endpoint maintenance window")
+			inEndpointMaintenanceWindow = true
+			break // one active window is enough; avoids re-checking and re-logging for the remaining windows
+		}
+	}
+	if !cfg.Maintenance.IsUnderMaintenance() && !inEndpointMaintenanceWindow {
+		HandleAlerting(ep, result, cfg.Alerting) // TODO: Consider moving this after the semaphore is released? I mean, how much noise can a single alerting provider cause...
+	} else {
+		logr.Debug("[watchdog.executeEndpoint] Not handling alerting because currently in the maintenance window")
+	}
+	logr.Debugf("[watchdog.executeEndpoint] Waiting for interval=%s before monitoring group=%s endpoint=%s (key=%s) again", ep.Interval, ep.Group, ep.Name, ep.Key())
+}
+
+// UpdateEndpointStatus inserts the result of an endpoint evaluation in the storage; failures are logged, not returned
+func UpdateEndpointStatus(ep *endpoint.Endpoint, result *endpoint.Result) {
+	if insertErr := store.Get().InsertEndpointResult(ep, result); insertErr != nil {
+		logr.Errorf("[watchdog.UpdateEndpointStatus] Failed to insert result in storage: %s", insertErr.Error())
+	}
+}
--- a/watchdog/external_endpoint.go
+++ b/watchdog/external_endpoint.go
@@ -0,0 +1,83 @@
+package watchdog
+
+import (
+	"context"
+	"time"
+
+	"github.com/TwiN/gatus/v5/config"
+	"github.com/TwiN/gatus/v5/config/endpoint"
+	"github.com/TwiN/gatus/v5/metrics"
+	"github.com/TwiN/gatus/v5/storage/store"
+	"github.com/TwiN/logr"
+)
+
+func monitorExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, cfg *config.Config, extraLabels []string, ctx context.Context) {
+	heartbeatTicker := time.NewTicker(ee.Heartbeat.Interval) // no immediate check: the first one runs after a full interval
+	defer heartbeatTicker.Stop()
+	for {
+		select {
+		case <-heartbeatTicker.C:
+			executeExternalEndpointHeartbeat(ee, cfg, extraLabels)
+		case <-ctx.Done(): // monitoring was cancelled; leave the loop for good
+			logr.Warnf("[watchdog.monitorExternalEndpointHeartbeat] Canceling current execution of group=%s; endpoint=%s; key=%s", ee.Group, ee.Name, ee.Key())
+			return
+		}
+	}
+}
+
+// executeExternalEndpointHeartbeat stores a failed result for ee (and handles alerting) if no result was received within ee.Heartbeat.Interval
+func executeExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, cfg *config.Config, extraLabels []string) {
+	// Limit concurrent monitoring; acquiring only fails if the package-level ctx is cancelled (during shutdown)
+	if err := monitoringSemaphore.Acquire(ctx, 1); err != nil {
+		logr.Debugf("[watchdog.executeExternalEndpointHeartbeat] Context cancelled, skipping execution: %s", err.Error())
+		return
+	}
+	defer monitoringSemaphore.Release(1)
+	// If there's a connectivity checker configured, check if Gatus has internet connectivity
+	if cfg.Connectivity != nil && cfg.Connectivity.Checker != nil && !cfg.Connectivity.Checker.IsConnected() {
+		logr.Infof("[watchdog.executeExternalEndpointHeartbeat] No connectivity; skipping execution")
+		return
+	}
+	logr.Debugf("[watchdog.executeExternalEndpointHeartbeat] Checking heartbeat for group=%s; endpoint=%s; key=%s", ee.Group, ee.Name, ee.Key())
+	convertedEndpoint := ee.ToEndpoint()
+	hasReceivedResultWithinHeartbeatInterval, err := store.Get().HasEndpointStatusNewerThan(ee.Key(), time.Now().Add(-ee.Heartbeat.Interval))
+	if err != nil {
+		logr.Errorf("[watchdog.executeExternalEndpointHeartbeat] Failed to check if endpoint has received a result within the heartbeat interval: %s", err.Error())
+		return
+	}
+	if hasReceivedResultWithinHeartbeatInterval {
+		// If we received a result within the heartbeat interval, we don't want to create a successful result, so we skip the rest.
+		// We don't have to worry about alerting or metrics, because if the previous heartbeat failed while this one succeeds,
+		// it implies that there was a new result pushed, and that result being pushed should've resolved the alert.
+		logr.Infof("[watchdog.executeExternalEndpointHeartbeat] Checked heartbeat for group=%s; endpoint=%s; key=%s; success=%v; errors=%d", ee.Group, ee.Name, ee.Key(), hasReceivedResultWithinHeartbeatInterval, 0)
+		return
+	}
+	// All code after this point assumes the heartbeat failed
+	result := &endpoint.Result{
+		Timestamp: time.Now(),
+		Success:   false,
+		Errors:    []string{"heartbeat: no update received within " + ee.Heartbeat.Interval.String()},
+	}
+	if cfg.Metrics {
+		metrics.PublishMetricsForEndpoint(convertedEndpoint, result, extraLabels)
+	}
+	UpdateEndpointStatus(convertedEndpoint, result)
+	logr.Infof("[watchdog.executeExternalEndpointHeartbeat] Checked heartbeat for group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s", ee.Group, ee.Name, ee.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond))
+	inEndpointMaintenanceWindow := false
+	for _, maintenanceWindow := range ee.MaintenanceWindows {
+		if maintenanceWindow.IsUnderMaintenance() {
+			logr.Debug("[watchdog.executeExternalEndpointHeartbeat] Under endpoint maintenance window")
+			inEndpointMaintenanceWindow = true
+			break // one active window is enough; avoids re-checking and re-logging for the remaining windows
+		}
+	}
+	if !cfg.Maintenance.IsUnderMaintenance() && !inEndpointMaintenanceWindow {
+		HandleAlerting(convertedEndpoint, result, cfg.Alerting)
+		// Sync the failure/success counters back to the external endpoint
+		ee.NumberOfSuccessesInARow = convertedEndpoint.NumberOfSuccessesInARow
+		ee.NumberOfFailuresInARow = convertedEndpoint.NumberOfFailuresInARow
+	} else {
+		logr.Debug("[watchdog.executeExternalEndpointHeartbeat] Not handling alerting because currently in the maintenance window")
+	}
+	logr.Debugf("[watchdog.executeExternalEndpointHeartbeat] Waiting for interval=%s before checking heartbeat for group=%s endpoint=%s (key=%s) again", ee.Heartbeat.Interval, ee.Group, ee.Name, ee.Key())
+}
--- a/watchdog/suite.go
+++ b/watchdog/suite.go
@@ -0,0 +1,86 @@
+package watchdog
+
+import (
+	"context"
+	"time"
+
+	"github.com/TwiN/gatus/v5/config"
+	"github.com/TwiN/gatus/v5/config/suite"
+	"github.com/TwiN/gatus/v5/metrics"
+	"github.com/TwiN/gatus/v5/storage/store"
+	"github.com/TwiN/logr"
+)
+
+// monitorSuite runs a suite immediately, then again on every tick of s.Interval until ctx is done
+func monitorSuite(s *suite.Suite, cfg *config.Config, extraLabels []string, ctx context.Context) {
+	// The first run is not delayed by the ticker
+	executeSuite(s, cfg, extraLabels)
+	// Later runs are scheduled by a ticker that is stopped when this goroutine returns
+	suiteTicker := time.NewTicker(s.Interval)
+	defer suiteTicker.Stop()
+	for {
+		select {
+		case <-suiteTicker.C:
+			executeSuite(s, cfg, extraLabels)
+		case <-ctx.Done():
+			logr.Warnf("[watchdog.monitorSuite] Canceling monitoring for suite=%s", s.Name)
+			return
+		}
+	}
+}
+
+// executeSuite executes s once via s.Execute, then publishes metrics, stores the results and handles alerting
+func executeSuite(s *suite.Suite, cfg *config.Config, extraLabels []string) {
+	// A semaphore slot bounds how many endpoints/suites are evaluated concurrently
+	if err := monitoringSemaphore.Acquire(ctx, 1); err != nil {
+		// Acquiring only fails once the package-level ctx is cancelled (during shutdown)
+		logr.Debugf("[watchdog.executeSuite] Context cancelled, skipping execution: %s", err.Error())
+		return
+	}
+	defer monitoringSemaphore.Release(1)
+	// When a connectivity checker is configured and reports no connectivity, skip this run
+	if cfg.Connectivity != nil && cfg.Connectivity.Checker != nil && !cfg.Connectivity.Checker.IsConnected() {
+		logr.Infof("[watchdog.executeSuite] No connectivity; skipping suite=%s", s.Name)
+		return
+	}
+	logr.Debugf("[watchdog.executeSuite] Monitoring group=%s; suite=%s; key=%s", s.Group, s.Name, s.Key())
+	result := s.Execute()
+	if cfg.Metrics {
+		metrics.PublishMetricsForSuite(s, result, extraLabels)
+	}
+	// NOTE(review): s.Endpoints[i] is paired with result.EndpointResults[i] purely by index; confirm that
+	// Execute keeps the two slices aligned even when it does not run every endpoint.
+	for idx, suiteEndpoint := range s.Endpoints {
+		if idx >= len(result.EndpointResults) {
+			// No result left for this endpoint, nor for any endpoint after it
+			break
+		}
+		endpointResult := result.EndpointResults[idx]
+		UpdateEndpointStatus(suiteEndpoint, endpointResult)
+		// Alerting needs an alerting config and is suppressed during the global maintenance window
+		if cfg.Alerting == nil || cfg.Maintenance.IsUnderMaintenance() {
+			continue
+		}
+		underEndpointMaintenance := false
+		for _, window := range suiteEndpoint.MaintenanceWindows {
+			if window.IsUnderMaintenance() {
+				logr.Debug("[watchdog.executeSuite] Endpoint under maintenance window")
+				underEndpointMaintenance = true
+				break
+			}
+		}
+		if !underEndpointMaintenance {
+			HandleAlerting(suiteEndpoint, endpointResult, cfg.Alerting)
+		}
+	}
+	logr.Infof("[watchdog.executeSuite] Completed suite=%s; success=%v; errors=%d; duration=%v; endpoints_executed=%d/%d", s.Name, result.Success, len(result.Errors), result.Duration, len(result.EndpointResults), len(s.Endpoints))
+	// The suite-level result is stored last, after the per-endpoint results above
+	UpdateSuiteStatus(s, result)
+}
+
+// UpdateSuiteStatus persists the suite result in the database; a failed insert is logged, not returned
+func UpdateSuiteStatus(s *suite.Suite, result *suite.Result) {
+	if err := store.Get().InsertSuiteResult(s, result); err != nil {
+		logr.Errorf("[watchdog.UpdateSuiteStatus] Failed to insert suite result for suite=%s: %v", s.Name, err)
+	}
+}
--- a/watchdog/watchdog.go
+++ b/watchdog/watchdog.go
@@ -2,23 +2,22 @@ package watchdog

 import (
 	"context"
-	"sync"
 	"time"

-	"github.com/TwiN/gatus/v5/alerting"
 	"github.com/TwiN/gatus/v5/config"
-	"github.com/TwiN/gatus/v5/config/connectivity"
-	"github.com/TwiN/gatus/v5/config/endpoint"
-	"github.com/TwiN/gatus/v5/config/maintenance"
-	"github.com/TwiN/gatus/v5/metrics"
-	"github.com/TwiN/gatus/v5/storage/store"
-	"github.com/TwiN/logr"
+	"golang.org/x/sync/semaphore"
+)
+
+const (
+	// UnlimitedConcurrencyWeight is the semaphore capacity used when concurrency is set to 0 (unlimited).
+	// Each evaluation acquires a weight of 1, so this is a practical upper limit that still allows very high concurrency.
+	UnlimitedConcurrencyWeight = 10000
 )

 var (
-	// monitoringMutex is used to prevent multiple endpoint from being evaluated at the same time.
+	// monitoringSemaphore is used to limit the number of endpoints/suites that can be evaluated concurrently.
 	// Without this, conditions using response time may become inaccurate.
-	monitoringMutex sync.Mutex
+	monitoringSemaphore *semaphore.Weighted

 	ctx        context.Context
 	cancelFunc context.CancelFunc
@@ -27,12 +26,20 @@ var (
 // Monitor loops over each endpoint and starts a goroutine to monitor each endpoint separately
 func Monitor(cfg *config.Config) {
 	ctx, cancelFunc = context.WithCancel(context.Background())
+	// Initialize semaphore based on concurrency configuration
+	if cfg.Concurrency == 0 {
+		// Unlimited concurrency - use a very high limit
+		monitoringSemaphore = semaphore.NewWeighted(UnlimitedConcurrencyWeight)
+	} else {
+		// Limited concurrency based on configuration
+		monitoringSemaphore = semaphore.NewWeighted(int64(cfg.Concurrency))
+	}
 	extraLabels := cfg.GetUniqueExtraMetricLabels()
 	for _, endpoint := range cfg.Endpoints {
 		if endpoint.IsEnabled() {
 			// To prevent multiple requests from running at the same time, we'll wait for a little before each iteration
-			time.Sleep(777 * time.Millisecond)
-			go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, extraLabels, ctx)
+			time.Sleep(222 * time.Millisecond)
+			go monitorEndpoint(endpoint, cfg, extraLabels, ctx)
 		}
 	}
 	for _, externalEndpoint := range cfg.ExternalEndpoints {
@@ -40,153 +47,27 @@ func Monitor(cfg *config.Config) {
 		// If the external endpoint does not use heartbeat, then it does not need to be monitored periodically, because
 		// alerting is checked every time an external endpoint is pushed to Gatus, unlike normal endpoints.
 		if externalEndpoint.IsEnabled() && externalEndpoint.Heartbeat.Interval > 0 {
-			go monitorExternalEndpointHeartbeat(externalEndpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx, extraLabels)
+			go monitorExternalEndpointHeartbeat(externalEndpoint, cfg, extraLabels, ctx)
 		}
 	}
-}
-
-// monitor a single endpoint in a loop
-func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string, ctx context.Context) {
-	// Run it immediately on start
-	execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels)
-	// Loop for the next executions
-	ticker := time.NewTicker(ep.Interval)
-	defer ticker.Stop()
-	for {
-		select {
-		case <-ctx.Done():
-			logr.Warnf("[watchdog.monitor] Canceling current execution of group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
-			return
-		case <-ticker.C:
-			execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels)
+	for _, suite := range cfg.Suites {
+		if suite.IsEnabled() {
+			time.Sleep(222 * time.Millisecond)
+			go monitorSuite(suite, cfg, extraLabels, ctx)
 		}
 	}
-	// Just in case somebody wandered all the way to here and wonders, "what about ExternalEndpoints?"
-	// Alerting is checked every time an external endpoint is pushed to Gatus, so they're not monitored
-	// periodically like they are for normal endpoints.
-}
-
-func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string) {
-	if !disableMonitoringLock {
-		// By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which
-		// could cause performance issues and return inaccurate results
-		monitoringMutex.Lock()
-		defer monitoringMutex.Unlock()
-	}
-	// If there's a connectivity checker configured, check if Gatus has internet connectivity
-	if connectivityConfig != nil && connectivityConfig.Checker != nil && !connectivityConfig.Checker.IsConnected() {
-		logr.Infof("[watchdog.execute] No connectivity; skipping execution")
-		return
-	}
-	logr.Debugf("[watchdog.execute] Monitoring group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
-	result := ep.EvaluateHealth()
-	if enabledMetrics {
-		metrics.PublishMetricsForEndpoint(ep, result, extraLabels)
-	}
-	UpdateEndpointStatuses(ep, result)
-	if logr.GetThreshold() == logr.LevelDebug && !result.Success {
-		logr.Debugf("[watchdog.execute] Monitored group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s; body=%s", ep.Group, ep.Name, ep.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond), result.Body)
-	} else {
-		logr.Infof("[watchdog.execute] Monitored group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s", ep.Group, ep.Name, ep.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond))
-	}
-	inEndpointMaintenanceWindow := false
-	for _, maintenanceWindow := range ep.MaintenanceWindows {
-		if maintenanceWindow.IsUnderMaintenance() {
-			logr.Debug("[watchdog.execute] Under endpoint maintenance window")
-			inEndpointMaintenanceWindow = true
-		}
-	}
-	if !maintenanceConfig.IsUnderMaintenance() && !inEndpointMaintenanceWindow {
-		// TODO: Consider moving this after the monitoring lock is unlocked? I mean, how much noise can a single alerting provider cause...
-		HandleAlerting(ep, result, alertingConfig)
-	} else {
-		logr.Debug("[watchdog.execute] Not handling alerting because currently in the maintenance window")
-	}
-	logr.Debugf("[watchdog.execute] Waiting for interval=%s before monitoring group=%s endpoint=%s (key=%s) again", ep.Interval, ep.Group, ep.Name, ep.Key())
-}
-
-func monitorExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context, extraLabels []string) {
-	ticker := time.NewTicker(ee.Heartbeat.Interval)
-	defer ticker.Stop()
-	for {
-		select {
-		case <-ctx.Done():
-			logr.Warnf("[watchdog.monitorExternalEndpointHeartbeat] Canceling current execution of group=%s; endpoint=%s; key=%s", ee.Group, ee.Name, ee.Key())
-			return
-		case <-ticker.C:
-			executeExternalEndpointHeartbeat(ee, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels)
-		}
-	}
-}
-
-func executeExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string) {
-	if !disableMonitoringLock {
-		// By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which
-		// could cause performance issues and return inaccurate results
-		monitoringMutex.Lock()
-		defer monitoringMutex.Unlock()
-	}
-	// If there's a connectivity checker configured, check if Gatus has internet connectivity
-	if connectivityConfig != nil && connectivityConfig.Checker != nil && !connectivityConfig.Checker.IsConnected() {
-		logr.Infof("[watchdog.monitorExternalEndpointHeartbeat] No connectivity; skipping execution")
-		return
-	}
-	logr.Debugf("[watchdog.monitorExternalEndpointHeartbeat] Checking heartbeat for group=%s; endpoint=%s; key=%s", ee.Group, ee.Name, ee.Key())
-	convertedEndpoint := ee.ToEndpoint()
-	hasReceivedResultWithinHeartbeatInterval, err := store.Get().HasEndpointStatusNewerThan(ee.Key(), time.Now().Add(-ee.Heartbeat.Interval))
-	if err != nil {
-		logr.Errorf("[watchdog.monitorExternalEndpointHeartbeat] Failed to check if endpoint has received a result within the heartbeat interval: %s", err.Error())
-		return
-	}
-	if hasReceivedResultWithinHeartbeatInterval {
-		// If we received a result within the heartbeat interval, we don't want to create a successful result, so we
-		// skip the rest. We don't have to worry about alerting or metrics, because if the previous heartbeat failed
-		// while this one succeeds, it implies that there was a new result pushed, and that result being pushed
-		// should've resolved the alert.
-		logr.Infof("[watchdog.monitorExternalEndpointHeartbeat] Checked heartbeat for group=%s; endpoint=%s; key=%s; success=%v; errors=%d", ee.Group, ee.Name, ee.Key(), hasReceivedResultWithinHeartbeatInterval, 0)
-		return
-	}
-	// All code after this point assumes the heartbeat failed
-	result := &endpoint.Result{
-		Timestamp: time.Now(),
-		Success:   false,
-		Errors:    []string{"heartbeat: no update received within " + ee.Heartbeat.Interval.String()},
-	}
-	if enabledMetrics {
-		metrics.PublishMetricsForEndpoint(convertedEndpoint, result, extraLabels)
-	}
-	UpdateEndpointStatuses(convertedEndpoint, result)
-	logr.Infof("[watchdog.monitorExternalEndpointHeartbeat] Checked heartbeat for group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s", ee.Group, ee.Name, ee.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond))
-	inEndpointMaintenanceWindow := false
-	for _, maintenanceWindow := range ee.MaintenanceWindows {
-		if maintenanceWindow.IsUnderMaintenance() {
-			logr.Debug("[watchdog.monitorExternalEndpointHeartbeat] Under endpoint maintenance window")
-			inEndpointMaintenanceWindow = true
-		}
-	}
-	if !maintenanceConfig.IsUnderMaintenance() && !inEndpointMaintenanceWindow {
-		HandleAlerting(convertedEndpoint, result, alertingConfig)
-		// Sync the failure/success counters back to the external endpoint
-		ee.NumberOfSuccessesInARow = convertedEndpoint.NumberOfSuccessesInARow
-		ee.NumberOfFailuresInARow = convertedEndpoint.NumberOfFailuresInARow
-	} else {
-		logr.Debug("[watchdog.monitorExternalEndpointHeartbeat] Not handling alerting because currently in the maintenance window")
-	}
-	logr.Debugf("[watchdog.monitorExternalEndpointHeartbeat] Waiting for interval=%s before checking heartbeat for group=%s endpoint=%s (key=%s) again", ee.Heartbeat.Interval, ee.Group, ee.Name, ee.Key())
-}
-
-// UpdateEndpointStatuses updates the slice of endpoint statuses
-func UpdateEndpointStatuses(ep *endpoint.Endpoint, result *endpoint.Result) {
-	if err := store.Get().Insert(ep, result); err != nil {
-		logr.Errorf("[watchdog.UpdateEndpointStatuses] Failed to insert result in storage: %s", err.Error())
-	}
 }

 // Shutdown stops monitoring all endpoints
 func Shutdown(cfg *config.Config) {
-	// Disable all the old HTTP connections
+	// Stop in-flight HTTP connections of both the standalone endpoints and the suite endpoints
 	for _, ep := range cfg.Endpoints {
 		ep.Close()
 	}
+	for _, s := range cfg.Suites {
+		for _, ep := range s.Endpoints {
+			ep.Close()
+		}
+	}
 	cancelFunc()
 }