diff --git a/.github/assets/slack-alerts.png b/.github/assets/slack-alerts.png new file mode 100644 index 00000000..3f9e3cad Binary files /dev/null and b/.github/assets/slack-alerts.png differ diff --git a/README.md b/README.md index e3588728..41344680 100644 --- a/README.md +++ b/README.md @@ -67,38 +67,40 @@ This example would look like this: ![Simple example](.github/assets/example.png) -Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) +Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) ### Configuration -| Parameter | Description | Default | -| --------------------------------- | --------------------------------------------------------------- | -------------- | -| `metrics` | Whether to expose metrics at /metrics | `false` | -| `services` | List of services to monitor | Required `[]` | -| `services[].name` | Name of the service. Can be anything. | Required `""` | -| `services[].url` | URL to send the request to | Required `""` | -| `services[].conditions` | Conditions used to determine the health of the service | `[]` | -| `services[].interval` | Duration to wait between every status check | `60s` | -| `services[].method` | Request method | `GET` | -| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | -| `services[].body` | Request body | `""` | -| `services[].headers` | Request headers | `{}` | -| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` | -| `services[].alerts[].enabled` | Whether to enable the alert | `false` | -| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | -| `services[].alerts[].description` | Description of the alert. 
Will be included in the alert sent | `""` | -| `alerting` | Configuration for alerting | `{}` | -| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | -| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | -| `alerting.twilio.sid` | Twilio account SID | Required `""` | -| `alerting.twilio.token` | Twilio auth token | Required `""` | -| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | -| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | -| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | -| `alerting.custom.url` | Custom alerting request url | `""` | -| `alerting.custom.body` | Custom alerting request body. | `""` | -| `alerting.custom.headers` | Custom alerting request headers | `{}` | +| Parameter | Description | Default | +| -------------------------------------- | --------------------------------------------------------------- | -------------- | +| `debug` | Whether to enable debug logs | `false` | +| `metrics` | Whether to expose metrics at /metrics | `false` | +| `services` | List of services to monitor | Required `[]` | +| `services[].name` | Name of the service. Can be anything. | Required `""` | +| `services[].url` | URL to send the request to | Required `""` | +| `services[].conditions` | Conditions used to determine the health of the service | `[]` | +| `services[].interval` | Duration to wait between every status check | `60s` | +| `services[].method` | Request method | `GET` | +| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | +| `services[].body` | Request body | `""` | +| `services[].headers` | Request headers | `{}` | +| `services[].alerts[].type` | Type of alert. 
Valid types: `slack`, `twilio`, `custom` | Required `""` | +| `services[].alerts[].enabled` | Whether to enable the alert | `false` | +| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | +| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | +| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` | +| `alerting` | Configuration for alerting | `{}` | +| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | +| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | +| `alerting.twilio.sid` | Twilio account SID | Required `""` | +| `alerting.twilio.token` | Twilio auth token | Required `""` | +| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | +| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | +| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | +| `alerting.custom.url` | Custom alerting request url | `""` | +| `alerting.custom.body` | Custom alerting request body. | `""` | +| `alerting.custom.headers` | Custom alerting request headers | `{}` | ### Conditions @@ -121,7 +123,7 @@ Here are some examples of conditions you can use: ## Docker -Building the Docker image is done as following: +Building the Docker image is done as follows: ``` docker build . 
-t gatus @@ -194,33 +196,37 @@ services: - type: slack enabled: true description: "healthcheck failed 3 times in a row" + send-on-resolved: true - type: slack enabled: true threshold: 5 description: "healthcheck failed 5 times in a row" + send-on-resolved: true conditions: - "[STATUS] == 200" - "[BODY].status == UP" - "[RESPONSE_TIME] < 300" ``` +Here's an example of what the notifications look like: + +![Slack notifications](.github/assets/slack-alerts.png) + + ### Configuring Twilio alerts ```yaml alerting: twilio: - sid: **** - token: **** - from: +1-234-567-8901 - to: +1-234-567-8901 + sid: "..." + token: "..." + from: "+1-234-567-8901" + to: "+1-234-567-8901" services: - name: twinnation interval: 30s url: "https://twinnation.org/health" alerts: - - type: twilio - enabled: true - description: "healthcheck failed 3 times in a row" - type: twilio enabled: true threshold: 5 diff --git a/config/config.go b/config/config.go index 23638975..2ba7fe9d 100644 --- a/config/config.go +++ b/config/config.go @@ -22,6 +22,7 @@ var ( type Config struct { Metrics bool `yaml:"metrics"` + Debug bool `yaml:"debug"` Alerting *core.AlertingConfig `yaml:"alerting"` Services []*core.Service `yaml:"services"` } diff --git a/core/alerting.go b/core/alerting.go index 530a6dda..5f595f13 100644 --- a/core/alerting.go +++ b/core/alerting.go @@ -2,9 +2,11 @@ package core import ( "bytes" + "encoding/base64" "fmt" "github.com/TwinProduction/gatus/client" "net/http" + "net/url" "strings" ) @@ -70,3 +72,70 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) } return nil } + +// CreateSlackCustomAlertProvider creates the CustomAlertProvider that posts a Slack message about the given alert to slackWebHookUrl. +// If resolved is true, the message says the alert has been resolved and the attachment is green; otherwise it says the alert has been triggered and the attachment is red. +// The attachment also lists every condition of result, marked as passed or failed. +func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider { + var message string + var color string + if resolved { + message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow) + color = "#36A64F" + } else { + message = 
fmt.Sprintf("An alert for *%s* has been triggered", service.Name) + color = "#DD0000" + } + var results string + for _, conditionResult := range result.ConditionResults { + var prefix string + if conditionResult.Success { + prefix = ":heavy_check_mark:" + } else { + prefix = ":x:" + } + // Use the escaped sequence \n instead of a raw line feed: the results are spliced into a JSON string below, where raw control characters are invalid + results += fmt.Sprintf("%s - `%s`\\n", prefix, conditionResult.Condition) + } + return &CustomAlertProvider{ + Url: slackWebHookUrl, + Method: "POST", + Body: fmt.Sprintf(`{ + "text": "", + "attachments": [ + { + "title": ":helmet_with_white_cross: Gatus", + "text": "%s:\n> %s", + "short": false, + "color": "%s", + "fields": [ + { + "title": "Condition results", + "value": "%s", + "short": false + } + ] + } + ] +}`, message, alert.Description, color, results), + Headers: map[string]string{"Content-Type": "application/json"}, + } +} + +// CreateTwilioCustomAlertProvider creates the CustomAlertProvider that posts message to the Twilio Messages endpoint of the account identified by provider.SID, +// using provider.From and provider.To as the sender and recipient and HTTP basic authentication built from provider.SID and provider.Token. +func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider { + return &CustomAlertProvider{ + Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID), + Method: "POST", + Body: url.Values{ + "To": {provider.To}, + "From": {provider.From}, + "Body": {message}, + }.Encode(), + Headers: map[string]string{ + "Content-Type": "application/x-www-form-urlencoded", + "Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", provider.SID, provider.Token)))), + }, + } +} diff --git a/main.go b/main.go index 23d72080..ba8fe163 100644 --- a/main.go +++ b/main.go @@ -3,7 +3,6 @@ package main import ( "bytes" "compress/gzip" - "encoding/json" "github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/watchdog" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) { if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired { buffer := &bytes.Buffer{} gzipWriter := 
gzip.NewWriter(buffer) - serviceResults := watchdog.GetServiceResults() - data, err := json.Marshal(serviceResults) + data, err := watchdog.GetJsonEncodedServiceResults() if err != nil { - log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error()) + log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error()) writer.WriteHeader(http.StatusInternalServerError) - _, _ = writer.Write([]byte("Unable to marshall object to JSON")) + _, _ = writer.Write([]byte("Unable to marshal object to JSON")) return } gzipWriter.Write(data) diff --git a/watchdog/watchdog.go b/watchdog/watchdog.go index df8403cf..c237dcfa 100644 --- a/watchdog/watchdog.go +++ b/watchdog/watchdog.go @@ -1,25 +1,34 @@ package watchdog import ( - "encoding/base64" + "encoding/json" "fmt" "github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/core" "github.com/TwinProduction/gatus/metric" "log" - "net/url" "sync" "time" ) var ( serviceResults = make(map[string][]*core.Result) - rwLock sync.RWMutex + + // serviceResultsMutex is used to prevent concurrent map access + serviceResultsMutex sync.RWMutex + + // monitoringMutex is used to prevent multiple services from being evaluated at the same time. + // Without this, conditions using response time may become inaccurate. + monitoringMutex sync.Mutex ) -// GetServiceResults returns a list of the last 20 results for each services -func GetServiceResults() *map[string][]*core.Result { - return &serviceResults +// GetJsonEncodedServiceResults returns the stored results of every service (at most the last 20 per service), encoded using json.Marshal. +// The encoding is done here, while holding the read lock, so that the map is never read while monitor is writing to it. 
+func GetJsonEncodedServiceResults() ([]byte, error) { + serviceResultsMutex.RLock() + data, err := json.Marshal(serviceResults) + serviceResultsMutex.RUnlock() + return data, err } // Monitor loops over each services and starts a goroutine to monitor each services separately @@ -33,33 +42,39 @@ func Monitor(cfg *config.Config) { // monitor monitors a single service in a loop func monitor(service *core.Service) { + cfg := config.Get() for { // By placing the lock here, we prevent multiple services from being monitored at the exact same time, which // could cause performance issues and return inaccurate results - rwLock.Lock() - log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) + monitoringMutex.Lock() + if cfg.Debug { + log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) + } result := service.EvaluateConditions() metric.PublishMetricsForService(service, result) + serviceResultsMutex.Lock() serviceResults[service.Name] = append(serviceResults[service.Name], result) if len(serviceResults[service.Name]) > 20 { serviceResults[service.Name] = serviceResults[service.Name][1:] } - rwLock.Unlock() + serviceResultsMutex.Unlock() var extra string if !result.Success { extra = fmt.Sprintf("responseBody=%s", result.Body) } log.Printf( - "[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s", + "[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s", service.Name, + result.Success, len(result.Errors), result.Duration.Round(time.Millisecond), extra, ) - handleAlerting(service, result) - - log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name) + if cfg.Debug { + log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name) + } + monitoringMutex.Unlock() time.Sleep(service.Interval) } } @@ -72,10 +87,43 @@ func 
handleAlerting(service *core.Service, result *core.Result) { if result.Success { if service.NumberOfFailuresInARow > 0 { for _, alert := range service.Alerts { - if !alert.Enabled || !alert.SendOnResolved || alert.Threshold < service.NumberOfFailuresInARow { + if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow { continue } - // TODO + var alertProvider *core.CustomAlertProvider + if alert.Type == core.SlackAlert { + if len(cfg.Alerting.Slack) > 0 { + log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been resolved", alert.Description) + alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true) + } else { + log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") + } + } else if alert.Type == core.TwilioAlert { + if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { + log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description) + alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description)) + } else { + log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio isn't configured properly'") + } + } else if alert.Type == core.CustomAlert { + if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() { + log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alert.Description) + alertProvider = &core.CustomAlertProvider{ + Url: cfg.Alerting.Custom.Url, + Method: cfg.Alerting.Custom.Method, + Body: cfg.Alerting.Custom.Body, + Headers: cfg.Alerting.Custom.Headers, + } + } else { + log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured") + } + } + if alertProvider != nil { 
+ err := alertProvider.Send(service.Name, alert.Description) + if err != nil { + log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error()) + } + } } } service.NumberOfFailuresInARow = 0 @@ -90,33 +138,16 @@ func handleAlerting(service *core.Service, result *core.Result) { if alert.Type == core.SlackAlert { if len(cfg.Alerting.Slack) > 0 { log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alert.Description) - alertProvider = &core.CustomAlertProvider{ - Url: cfg.Alerting.Slack, - Method: "POST", - Body: fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alert.Description), - Headers: map[string]string{"Content-Type": "application/json"}, - } + alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false) } else { log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") } } else if alert.Type == core.TwilioAlert { if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description) - alertProvider = &core.CustomAlertProvider{ - Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID), - Method: "POST", - Body: url.Values{ - "To": {cfg.Alerting.Twilio.To}, - "From": {cfg.Alerting.Twilio.From}, - "Body": {fmt.Sprintf("%s - %s", service.Name, alert.Description)}, - }.Encode(), - Headers: map[string]string{ - "Content-Type": "application/x-www-form-urlencoded", - "Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))), - }, - } + alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description)) } else { - 
log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing") + log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio config settings missing") } } else if alert.Type == core.CustomAlert { if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {