diff --git a/README.md b/README.md index fdf5202d..71af2b0f 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ Have any feedback or questions? [Create a discussion](https://github.com/TwiN/ga - [OIDC](#oidc) - [TLS Encryption](#tls-encryption) - [Metrics](#metrics) + - [Custom Labels](#custom-labels) - [Connectivity](#connectivity) - [Remote instances (EXPERIMENTAL)](#remote-instances-experimental) - [Deployment](#deployment) @@ -1949,6 +1950,23 @@ endpoint on the same port your application is configured to run on (`web.port`). See [examples/docker-compose-grafana-prometheus](.examples/docker-compose-grafana-prometheus) for further documentation as well as an example. +#### Custom Labels + +Custom key-value labels can be attached to the Prometheus metrics published for an endpoint by setting `extra-labels`. Every unique label key defined across all enabled endpoints is added to the exported metrics, and endpoints that do not define a given label export an empty value for it. See the example below: + +```yaml +endpoints: + - name: front-end + group: core + url: "https://twin.sh/health" + interval: 5m + conditions: + - "[STATUS] == 200" + - "[BODY].status == UP" + - "[RESPONSE_TIME] < 150" + extra-labels: + environment: staging +``` ### Connectivity | Parameter | Description | Default | @@ -2183,7 +2201,7 @@ This works for SCTP based application. ### Monitoring a WebSocket endpoint -By prefixing `endpoints[].url` with `ws://` or `wss://`, you can monitor WebSocket endpoints: +By prefixing `endpoints[].url` with `ws://` or `wss://`, you can monitor WebSocket endpoints at a very basic level: ```yaml endpoints: - name: example diff --git a/api/external_endpoint.go b/api/external_endpoint.go index c2c0fce6..88c7e444 100644 --- a/api/external_endpoint.go +++ b/api/external_endpoint.go @@ -16,6 +16,7 @@ import ( ) func CreateExternalEndpointResult(cfg *config.Config) fiber.Handler { + extraLabels := cfg.GetUniqueExtraMetricLabels() return func(c *fiber.Ctx) error { // Check if the success query parameter is present success, exists := c.Queries()["success"] @@ -74,7 +75,7 @@ func CreateExternalEndpointResult(cfg *config.Config) fiber.Handler { externalEndpoint.NumberOfFailuresInARow = convertedEndpoint.NumberOfFailuresInARow } if cfg.Metrics { - metrics.PublishMetricsForEndpoint(convertedEndpoint, result) + metrics.PublishMetricsForEndpoint(convertedEndpoint, result, extraLabels) } // Return the result return c.Status(200).SendString("") diff --git a/config/config.go b/config/config.go index f7056a72..ca5247bb 100644 --- a/config/config.go +++ b/config/config.go @@ -102,6 +102,25 @@ type Config struct { lastFileModTime time.Time // last modification time } +// GetUniqueExtraMetricLabels returns a slice of the unique extra metric label keys from all enabled endpoints +// in the configuration. It iterates through each endpoint, checks if it is enabled, +// and then collects the unique label keys from each endpoint's ExtraLabels map.
+func (config *Config) GetUniqueExtraMetricLabels() []string { + labels := make([]string, 0) + for _, ep := range config.Endpoints { + if !ep.IsEnabled() { + continue + } + for label := range ep.ExtraLabels { + if contains(labels, label) { + continue + } + labels = append(labels, label) + } + } + return labels +} + func (config *Config) GetEndpointByKey(key string) *endpoint.Endpoint { for i := 0; i < len(config.Endpoints); i++ { ep := config.Endpoints[i] diff --git a/config/config_test.go b/config/config_test.go index 12ee5922..1b51090b 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -124,7 +124,7 @@ endpoints: name: "dir-with-two-config-files", configPath: dir, pathAndFiles: map[string]string{ - "config.yaml": `endpoints: + "config.yaml": `endpoints: - name: one url: https://example.com conditions: @@ -135,7 +135,7 @@ endpoints: url: https://example.org conditions: - "len([BODY]) > 0"`, - "config.yml": `endpoints: + "config.yml": `endpoints: - name: three url: https://twin.sh/health conditions: @@ -237,7 +237,7 @@ endpoints: for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for path, content := range scenario.pathAndFiles { - if err := os.WriteFile(filepath.Join(dir, path), []byte(content), 0644); err != nil { + if err := os.WriteFile(filepath.Join(dir, path), []byte(content), 0o644); err != nil { t.Fatalf("[%s] failed to write file: %v", scenario.name, err) } } @@ -282,7 +282,7 @@ func TestConfig_HasLoadedConfigurationBeenModified(t *testing.T) { url: https://twin.sh/health conditions: - "[STATUS] == 200" -`), 0644) +`), 0o644) t.Run("config-file-as-config-path", func(t *testing.T) { config, err := LoadConfiguration(configFilePath) @@ -298,7 +298,7 @@ func TestConfig_HasLoadedConfigurationBeenModified(t *testing.T) { - name: website url: https://twin.sh/health conditions: - - "[STATUS] == 200"`), 0644); err != nil { + - "[STATUS] == 200"`), 0o644); err != nil { t.Fatalf("failed to overwrite config file: %v", err) } if !config.HasLoadedConfigurationBeenModified() { @@ -315,7 +315,7 @@ func TestConfig_HasLoadedConfigurationBeenModified(t *testing.T) { } time.Sleep(time.Second) // Because the file mod time only has second precision, we have to wait for a second // Update the config file - if err = os.WriteFile(filepath.Join(dir, "metrics.yaml"), []byte(`metrics: true`), 0644); err != nil { + if err = os.WriteFile(filepath.Join(dir, "metrics.yaml"), []byte(`metrics: true`), 0o644); err != nil { t.Fatalf("failed to overwrite config file: %v", err) } if !config.HasLoadedConfigurationBeenModified() { @@ -713,7 +713,7 @@ func TestParseAndValidateBadConfigBytes(t *testing.T) { _, err := parseAndValidateConfigBytes([]byte(` badconfig: - asdsa: w0w - usadasdrl: asdxzczxc + usadasdrl: asdxzczxc asdas: - soup `)) @@ -1943,3 +1943,114 @@ func TestGetAlertingProviderByAlertType(t *testing.T) { }) } } + +func TestConfig_GetUniqueExtraMetricLabels(t *testing.T) { + tests := []struct { + name string + config *Config + expected []string + }{ + { + name: "no-endpoints", + config: &Config{ + Endpoints: []*endpoint.Endpoint{}, + }, + expected: []string{}, + }, + { + name: "single-endpoint-no-labels", + config: &Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "endpoint1", + URL: "https://example.com", + }, + }, + }, + expected: []string{}, + }, + { + name: "single-endpoint-with-labels", + config: &Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "endpoint1", + URL: "https://example.com", + Enabled: toPtr(true), + ExtraLabels: map[string]string{ + 
"env": "production", + "team": "backend", + }, + }, + }, + }, + expected: []string{"env", "team"}, + }, + { + name: "multiple-endpoints-with-labels", + config: &Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "endpoint1", + URL: "https://example.com", + Enabled: toPtr(true), + ExtraLabels: map[string]string{ + "env": "production", + "team": "backend", + "module": "auth", + }, + }, + { + Name: "endpoint2", + URL: "https://example.org", + Enabled: toPtr(true), + ExtraLabels: map[string]string{ + "env": "staging", + "team": "frontend", + }, + }, + }, + }, + expected: []string{"env", "team", "module"}, + }, + { + name: "multiple-endpoints-with-some-disabled", + config: &Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "endpoint1", + URL: "https://example.com", + Enabled: toPtr(true), + ExtraLabels: map[string]string{ + "env": "production", + "team": "backend", + }, + }, + { + Name: "endpoint2", + URL: "https://example.org", + Enabled: toPtr(false), + ExtraLabels: map[string]string{ + "module": "auth", + }, + }, + }, + }, + expected: []string{"env", "team"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + labels := tt.config.GetUniqueExtraMetricLabels() + if len(labels) != len(tt.expected) { + t.Errorf("expected %d labels, got %d", len(tt.expected), len(labels)) + } + for _, label := range tt.expected { + if !contains(labels, label) { + t.Errorf("expected label %s to be present", label) + } + } + }) + } +} diff --git a/config/endpoint/endpoint.go b/config/endpoint/endpoint.go index f72f675f..fd1bc305 100644 --- a/config/endpoint/endpoint.go +++ b/config/endpoint/endpoint.go @@ -99,6 +99,9 @@ type Endpoint struct { // Headers of the request Headers map[string]string `yaml:"headers,omitempty"` + // ExtraLabels are key-value pairs that can be used to metric the endpoint + ExtraLabels map[string]string `yaml:"extra-labels,omitempty"` + // Interval is the duration to wait between every status check Interval time.Duration `yaml:"interval,omitempty"` @@ -417,8 +420,7 @@ func (e *Endpoint) call(result *Result) { } else if endpointType == TypeSSH { // If there's no username/password specified, attempt to validate just the SSH banner if len(e.SSHConfig.Username) == 0 && len(e.SSHConfig.Password) == 0 { - result.Connected, result.HTTPStatus, err = - client.CheckSSHBanner(strings.TrimPrefix(e.URL, "ssh://"), e.ClientConfig) + result.Connected, result.HTTPStatus, err = client.CheckSSHBanner(strings.TrimPrefix(e.URL, "ssh://"), e.ClientConfig) if err != nil { result.AddError(err.Error()) return diff --git a/config/util.go b/config/util.go new file mode 100644 index 00000000..cffeaf0f --- /dev/null +++ b/config/util.go @@ -0,0 +1,16 @@ +package config + +// toPtr returns a pointer to the given value +func toPtr[T any](value T) *T { + return &value +} + +// contains checks if a key exists in the slice +func contains[T comparable](slice []T, key T) bool { + for _, item := range slice { + if item == key { + return true + } + } + return false +} diff --git a/main.go b/main.go index bbb6e0c1..3ecc3769 100644 --- a/main.go +++ b/main.go @@ -9,6 +9,7 @@ import ( "github.com/TwiN/gatus/v5/config" "github.com/TwiN/gatus/v5/controller" + "github.com/TwiN/gatus/v5/metrics" "github.com/TwiN/gatus/v5/storage/store" "github.com/TwiN/gatus/v5/watchdog" "github.com/TwiN/logr" @@ -49,6 +50,7 @@ func main() { func start(cfg *config.Config) { go controller.Handle(cfg) + metrics.InitializePrometheusMetrics(cfg, nil) watchdog.Monitor(cfg) go listenToConfigurationFileChanges(cfg) } diff 
--git a/metrics/metrics.go b/metrics/metrics.go index 990e4a5e..ef5ec3e6 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -3,16 +3,14 @@ package metrics import ( "strconv" + "github.com/TwiN/gatus/v5/config" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" ) const namespace = "gatus" // The prefix of the metrics var ( - initializedMetrics bool // Whether the metrics have been initialized - resultTotal *prometheus.CounterVec resultDurationSeconds *prometheus.GaugeVec resultConnectedTotal *prometheus.CounterVec @@ -21,64 +19,79 @@ var ( resultEndpointSuccess *prometheus.GaugeVec ) -func initializePrometheusMetrics() { - resultTotal = promauto.NewCounterVec(prometheus.CounterOpts{ +func InitializePrometheusMetrics(cfg *config.Config, reg prometheus.Registerer) { + if reg == nil { + reg = prometheus.DefaultRegisterer + } + extraLabels := cfg.GetUniqueExtraMetricLabels() + resultTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Name: "results_total", Help: "Number of results per endpoint", - }, []string{"key", "group", "name", "type", "success"}) - resultDurationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{ + }, append([]string{"key", "group", "name", "type", "success"}, extraLabels...)) + reg.MustRegister(resultTotal) + resultDurationSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Name: "results_duration_seconds", Help: "Duration of the request in seconds", - }, []string{"key", "group", "name", "type"}) - resultConnectedTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + }, append([]string{"key", "group", "name", "type"}, extraLabels...)) + reg.MustRegister(resultDurationSeconds) + resultConnectedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Name: "results_connected_total", Help: "Total number of results in which a connection was successfully established", - }, []string{"key", "group", "name", "type"}) - resultCodeTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + }, append([]string{"key", "group", "name", "type"}, extraLabels...)) + reg.MustRegister(resultConnectedTotal) + resultCodeTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Name: "results_code_total", Help: "Total number of results by code", - }, []string{"key", "group", "name", "type", "code"}) - resultCertificateExpirationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{ + }, append([]string{"key", "group", "name", "type", "code"}, extraLabels...)) + reg.MustRegister(resultCodeTotal) + resultCertificateExpirationSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Name: "results_certificate_expiration_seconds", Help: "Number of seconds until the certificate expires", - }, []string{"key", "group", "name", "type"}) - resultEndpointSuccess = promauto.NewGaugeVec(prometheus.GaugeOpts{ + }, append([]string{"key", "group", "name", "type"}, extraLabels...)) + reg.MustRegister(resultCertificateExpirationSeconds) + resultEndpointSuccess = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Name: "results_endpoint_success", Help: "Displays whether or not the endpoint was a success", - }, []string{"key", "group", "name", "type"}) + }, append([]string{"key", "group", "name", "type"}, extraLabels...)) + reg.MustRegister(resultEndpointSuccess) } // PublishMetricsForEndpoint publishes metrics for the given endpoint and its result. 
// These metrics will be exposed at /metrics if the metrics are enabled -func PublishMetricsForEndpoint(ep *endpoint.Endpoint, result *endpoint.Result) { - if !initializedMetrics { - initializePrometheusMetrics() - initializedMetrics = true +func PublishMetricsForEndpoint(ep *endpoint.Endpoint, result *endpoint.Result, extraLabels []string) { + labelValues := []string{} + for _, label := range extraLabels { + if value, ok := ep.ExtraLabels[label]; ok { + labelValues = append(labelValues, value) + } else { + labelValues = append(labelValues, "") + } } + endpointType := ep.Type() - resultTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.FormatBool(result.Success)).Inc() - resultDurationSeconds.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(result.Duration.Seconds()) + resultTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.FormatBool(result.Success)}, labelValues...)...).Inc() + resultDurationSeconds.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(result.Duration.Seconds()) if result.Connected { - resultConnectedTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Inc() + resultConnectedTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Inc() } if result.DNSRCode != "" { - resultCodeTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), result.DNSRCode).Inc() + resultCodeTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), result.DNSRCode}, labelValues...)...).Inc() } if result.HTTPStatus != 0 { - resultCodeTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.Itoa(result.HTTPStatus)).Inc() + resultCodeTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.Itoa(result.HTTPStatus)}, labelValues...)...).Inc() } if result.CertificateExpiration != 0 { - resultCertificateExpirationSeconds.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(result.CertificateExpiration.Seconds()) + resultCertificateExpirationSeconds.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(result.CertificateExpiration.Seconds()) } if result.Success { - resultEndpointSuccess.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(1) + resultEndpointSuccess.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(1) } else { - resultEndpointSuccess.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(0) + resultEndpointSuccess.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(0) } } diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index 73a680d9..25d6c9c3 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -5,13 +5,110 @@ import ( "testing" "time" + "github.com/TwiN/gatus/v5/config" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/TwiN/gatus/v5/config/endpoint/dns" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" ) +// TestInitializePrometheusMetrics tests metrics initialization with extraLabels. +// Note: Because of the global Prometheus registry, this test can only safely verify one label set per process. 
+// If the function is called with a different set of labels for the same metric, a panic will occur. +func TestInitializePrometheusMetrics(t *testing.T) { + cfgWithExtras := &config.Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "TestEP", + Group: "G", + URL: "http://x/", + ExtraLabels: map[string]string{ + "foo": "foo-val", + "hello": "world-val", + }, + }, + }, + } + reg := prometheus.NewRegistry() + InitializePrometheusMetrics(cfgWithExtras, reg) + // Metrics variables should be non-nil + if resultTotal == nil { + t.Error("resultTotal metric not initialized") + } + if resultDurationSeconds == nil { + t.Error("resultDurationSeconds metric not initialized") + } + if resultConnectedTotal == nil { + t.Error("resultConnectedTotal metric not initialized") + } + if resultCodeTotal == nil { + t.Error("resultCodeTotal metric not initialized") + } + if resultCertificateExpirationSeconds == nil { + t.Error("resultCertificateExpirationSeconds metric not initialized") + } + if resultEndpointSuccess == nil { + t.Error("resultEndpointSuccess metric not initialized") + } + + defer func() { + if r := recover(); r != nil { + t.Errorf("resultTotal.WithLabelValues panicked: %v", r) + } + }() + _ = resultTotal.WithLabelValues("k", "g", "n", "ty", "true", "fval", "hval") +} + +// TestPublishMetricsForEndpoint_withExtraLabels ensures extraLabels are included in the exported metrics. +func TestPublishMetricsForEndpoint_withExtraLabels(t *testing.T) { + // Only test one label set per process due to Prometheus registry limits. + reg := prometheus.NewRegistry() + InitializePrometheusMetrics(&config.Config{ + Endpoints: []*endpoint.Endpoint{ + { + Name: "ep-extra", + URL: "https://sample.com", + ExtraLabels: map[string]string{ + "foo": "my-foo", + "bar": "my-bar", + }, + }, + }, + }, reg) + + ep := &endpoint.Endpoint{ + Name: "ep-extra", + Group: "g1", + URL: "https://sample.com", + ExtraLabels: map[string]string{ + "foo": "my-foo", + "bar": "my-bar", + }, + } + result := &endpoint.Result{ + HTTPStatus: 200, + Connected: true, + Duration: 2340 * time.Millisecond, + Success: true, + } + // Order of extraLabels as per GetUniqueExtraMetricLabels is ["foo", "bar"] + PublishMetricsForEndpoint(ep, result, []string{"foo", "bar"}) + + expected := ` +# HELP gatus_results_total Number of results per endpoint +# TYPE gatus_results_total counter +gatus_results_total{bar="my-bar",foo="my-foo",group="g1",key="g1_ep-extra",name="ep-extra",success="true",type="HTTP"} 1 +` + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected), "gatus_results_total") + if err != nil { + t.Error("metrics export does not include extraLabels as expected:", err) + } +} + func TestPublishMetricsForEndpoint(t *testing.T) { + reg := prometheus.NewRegistry() + InitializePrometheusMetrics(&config.Config{}, reg) + httpEndpoint := &endpoint.Endpoint{Name: "http-ep-name", Group: "http-ep-group", URL: "https://example.org"} PublishMetricsForEndpoint(httpEndpoint, &endpoint.Result{ HTTPStatus: 200, @@ -23,8 +120,8 @@ func TestPublishMetricsForEndpoint(t *testing.T) { }, Success: true, CertificateExpiration: 49 * time.Hour, - }) - err := testutil.GatherAndCompare(prometheus.Gatherers{prometheus.DefaultGatherer}, bytes.NewBufferString(` + }, []string{}) + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP gatus_results_code_total Total number of results by code # TYPE gatus_results_code_total counter 
gatus_results_code_total{code="200",group="http-ep-group",key="http-ep-group_http-ep-name",name="http-ep-name",type="HTTP"} 1 @@ -57,8 +154,8 @@ gatus_results_endpoint_success{group="http-ep-group",key="http-ep-group_http-ep- }, Success: false, CertificateExpiration: 47 * time.Hour, - }) - err = testutil.GatherAndCompare(prometheus.Gatherers{prometheus.DefaultGatherer}, bytes.NewBufferString(` + }, []string{}) + err = testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP gatus_results_code_total Total number of results by code # TYPE gatus_results_code_total counter gatus_results_code_total{code="200",group="http-ep-group",key="http-ep-group_http-ep-name",name="http-ep-name",type="HTTP"} 2 @@ -82,10 +179,12 @@ gatus_results_endpoint_success{group="http-ep-group",key="http-ep-group_http-ep- if err != nil { t.Errorf("Expected no errors but got: %v", err) } - dnsEndpoint := &endpoint.Endpoint{Name: "dns-ep-name", Group: "dns-ep-group", URL: "8.8.8.8", DNSConfig: &dns.Config{ - QueryType: "A", - QueryName: "example.com.", - }} + dnsEndpoint := &endpoint.Endpoint{ + Name: "dns-ep-name", Group: "dns-ep-group", URL: "8.8.8.8", DNSConfig: &dns.Config{ + QueryType: "A", + QueryName: "example.com.", + }, + } PublishMetricsForEndpoint(dnsEndpoint, &endpoint.Result{ DNSRCode: "NOERROR", Connected: true, @@ -94,8 +193,8 @@ gatus_results_endpoint_success{group="http-ep-group",key="http-ep-group_http-ep- {Condition: "[DNS_RCODE] == NOERROR", Success: true}, }, Success: true, - }) - err = testutil.GatherAndCompare(prometheus.Gatherers{prometheus.DefaultGatherer}, bytes.NewBufferString(` + }, []string{}) + err = testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP gatus_results_code_total Total number of results by code # TYPE gatus_results_code_total counter gatus_results_code_total{code="200",group="http-ep-group",key="http-ep-group_http-ep-name",name="http-ep-name",type="HTTP"} 2 diff --git a/watchdog/watchdog.go b/watchdog/watchdog.go index 1a8f14c4..be3ce607 100644 --- a/watchdog/watchdog.go +++ b/watchdog/watchdog.go @@ -27,11 +27,12 @@ var ( // Monitor loops over each endpoint and starts a goroutine to monitor each endpoint separately func Monitor(cfg *config.Config) { ctx, cancelFunc = context.WithCancel(context.Background()) + extraLabels := cfg.GetUniqueExtraMetricLabels() for _, endpoint := range cfg.Endpoints { if endpoint.IsEnabled() { // To prevent multiple requests from running at the same time, we'll wait for a little before each iteration time.Sleep(777 * time.Millisecond) - go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx) + go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, extraLabels, ctx) } } for _, externalEndpoint := range cfg.ExternalEndpoints { @@ -39,15 +40,15 @@ func Monitor(cfg *config.Config) { // If the external endpoint does not use heartbeat, then it does not need to be monitored periodically, because // alerting is checked every time an external endpoint is pushed to Gatus, unlike normal endpoints. 
if externalEndpoint.IsEnabled() && externalEndpoint.Heartbeat.Interval > 0 { - go monitorExternalEndpointHeartbeat(externalEndpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx) + go monitorExternalEndpointHeartbeat(externalEndpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx, extraLabels) } } } // monitor a single endpoint in a loop -func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context) { +func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string, ctx context.Context) { // Run it immediately on start - execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics) + execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels) // Loop for the next executions ticker := time.NewTicker(ep.Interval) defer ticker.Stop() @@ -57,7 +58,7 @@ func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenance logr.Warnf("[watchdog.monitor] Canceling current execution of group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key()) return case <-ticker.C: - execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics) + execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels) } } // Just in case somebody wandered all the way to here and wonders, "what about ExternalEndpoints?" @@ -65,7 +66,7 @@ func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenance // periodically like they are for normal endpoints. 
} -func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool) { +func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string) { if !disableMonitoringLock { // By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which // could cause performance issues and return inaccurate results @@ -80,7 +81,7 @@ func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenance logr.Debugf("[watchdog.execute] Monitoring group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key()) result := ep.EvaluateHealth() if enabledMetrics { - metrics.PublishMetricsForEndpoint(ep, result) + metrics.PublishMetricsForEndpoint(ep, result, extraLabels) } UpdateEndpointStatuses(ep, result) if logr.GetThreshold() == logr.LevelDebug && !result.Success { @@ -104,7 +105,7 @@ func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenance logr.Debugf("[watchdog.execute] Waiting for interval=%s before monitoring group=%s endpoint=%s (key=%s) again", ep.Interval, ep.Group, ep.Name, ep.Key()) } -func monitorExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context) { +func monitorExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context, extraLabels []string) { ticker := time.NewTicker(ee.Heartbeat.Interval) defer ticker.Stop() for { @@ -113,12 +114,12 @@ func monitorExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingCon logr.Warnf("[watchdog.monitorExternalEndpointHeartbeat] Canceling current execution of group=%s; endpoint=%s; key=%s", ee.Group, ee.Name, ee.Key()) return case <-ticker.C: - executeExternalEndpointHeartbeat(ee, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics) + executeExternalEndpointHeartbeat(ee, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, extraLabels) } } } -func executeExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool) { +func executeExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, extraLabels []string) { if !disableMonitoringLock { // By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which // could cause performance issues and return inaccurate results @@ -152,7 +153,7 @@ func executeExternalEndpointHeartbeat(ee *endpoint.ExternalEndpoint, alertingCon Errors: []string{"heartbeat: no update received within " + ee.Heartbeat.Interval.String()}, } if enabledMetrics { - metrics.PublishMetricsForEndpoint(convertedEndpoint, result) + metrics.PublishMetricsForEndpoint(convertedEndpoint, result, 
extraLabels) } UpdateEndpointStatuses(convertedEndpoint, result) logr.Infof("[watchdog.monitorExternalEndpointHeartbeat] Checked heartbeat for group=%s; endpoint=%s; key=%s; success=%v; errors=%d; duration=%s", ee.Group, ee.Name, ee.Key(), result.Success, len(result.Errors), result.Duration.Round(time.Millisecond))
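
For illustration only (not part of the patch above): with the README example configuration, where the `front-end` endpoint in group `core` defines `extra-labels: environment: staging`, the exported `gatus_results_total` series would be expected to look roughly like the sketch below. The sample value and label ordering are illustrative; an endpoint that does not define `environment` would export the same series with an empty value for that label, since every unique label key returned by `GetUniqueExtraMetricLabels` is applied to all metrics.

```
# HELP gatus_results_total Number of results per endpoint
# TYPE gatus_results_total counter
gatus_results_total{environment="staging",group="core",key="core_front-end",name="front-end",success="true",type="HTTP"} 1
```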