Prometheus absence detector for GKE instances.
* Adds a tool that generates Prometheus alerts if there is no data
for a given alert.
* Also adds a project: label to each alert, which helps generate
good links to logs.
* Moves some alerts between general/public/corp to rationalize things.
Bug: skia:
Change-Id: I13b4863aefc1130c23c93f74c4d3047b136a108e
Reviewed-on: https://skia-review.googlesource.com/c/177725
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/.gitignore b/.gitignore
index 297e904..b069925 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,6 +102,7 @@
# promk
promk/tmp
promk/tmpconf
+promk/prometheus/absent_*.yml
# kube
**/chat_config.txt
.vscode
diff --git a/promk/Makefile b/promk/Makefile
index 24182cb..75d7f27 100644
--- a/promk/Makefile
+++ b/promk/Makefile
@@ -1,9 +1,22 @@
+
.PHONY: validate push_config
-validate: ./tmp/promtool
+# Generates the absent_*.yml rule files with promk-absent and then checks all
+# rule files with promtool. Each --output name must match an entry under
+# rule_files in prometheus/prometheus-corp.yml / prometheus-public.yml.
+validate: ./tmp/promtool absent
+ promk-absent --input=prometheus/alerts_alert_to_pubsub.yml --output=prometheus/absent_alerts_alert_to_pubsub.yml
+ promk-absent --input=prometheus/alerts_corp.yml --output=prometheus/absent_alerts_corp.yml
+ promk-absent --input=prometheus/alerts_general.yml --output=prometheus/absent_alerts_general.yml
+ promk-absent --input=prometheus/alerts_public.yml --output=prometheus/absent_alerts_public.yml
# If the only error is "FAILED: error checking bearer token file..." then you should be good.
-./tmp/promtool check config ./prometheus/prometheus-public.yml
-./tmp/promtool check config ./prometheus/prometheus-corp.yml
./tmp/promtool check rules ./prometheus/alerts_*.yml
+ ./tmp/promtool check rules ./prometheus/absent_*.yml
+
+# Installs the promk-absent binary that the validate target runs.
+.PHONY: absent
+absent:
+ go install ./go/promk-absent
+
+# Runs the Go unit tests under ./go. Declared phony, like validate and absent,
+# so a file named "testgo" can never mask the target.
+.PHONY: testgo
+testgo:
+ go test ./go/...
./tmp/promtool:
-mkdir -p tmp
@@ -16,6 +29,7 @@
mkdir ./tmpconf
cp prometheus/prometheus-public.yml ./tmpconf/prometheus.yml
cp prometheus/alerts_*.yml ./tmpconf
+ cp prometheus/absent_alerts_*.yml ./tmpconf
# Need to use replace so that the configmap gets updated. Change "replace"
# to "create" if this is the first time the configmap has been uploaded.
kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
@@ -27,6 +41,7 @@
mkdir ./tmpconf
cp prometheus/prometheus-corp.yml ./tmpconf/prometheus.yml
cp prometheus/alerts_*.yml ./tmpconf
+ cp prometheus/absent_alerts_*.yml ./tmpconf
kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
include ../make/clusters.mk
diff --git a/promk/go/promk-absent/main.go b/promk/go/promk-absent/main.go
new file mode 100644
index 0000000..75d84d2
--- /dev/null
+++ b/promk/go/promk-absent/main.go
@@ -0,0 +1,115 @@
+// An application to create a new set of alerts from an existing set of alerts.
+//
+// The new alerts detect if no data is present for the associated alert.
+//
+// Presumes that all expressions are written in the form of:
+//
+// equation [<>!=]+ (some constant)
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "regexp"
+ "strings"
+
+ "go.skia.org/infra/go/common"
+ "go.skia.org/infra/go/sklog"
+ yaml "gopkg.in/yaml.v2"
+)
+
+// Alerts is the top-level structure of a Prometheus alert rules YAML file: a
+// list of rule groups under the "groups" key. It is used both to parse the
+// input file and to serialize the generated absent alerts.
+type Alerts struct {
+ Groups []Group `yaml:"groups"`
+}
+
+// Group is one named group of rules inside an Alerts file.
+type Group struct {
+ Name string `yaml:"name"`
+ Rules []Rule `yaml:"rules"`
+}
+
+// Rule is a single alerting rule: the alert name, its expression, and the
+// labels and annotations attached to it.
+//
+// Only these four keys are modelled; other keys a rule may carry in the input
+// (for example "for") have no field here and so do not appear in the output.
+type Rule struct {
+ Alert string `yaml:"alert"`
+ Expr string `yaml:"expr"`
+ Labels map[string]string `yaml:"labels"`
+ Annotations map[string]string `yaml:"annotations"`
+}
+
+// Command-line flags.
+var (
+ // input names the alert rules YAML file that main reads.
+ input = flag.String("input", "", "Name of file to read.")
+ // output names the file main writes the generated absent alerts to.
+ output = flag.String("output", "", "Name of file to write.")
+)
+
+var (
+ // atComparison matches a run of one or more of the characters <, >, = and
+ // !, i.e. a comparison operator such as ==, !=, >= or <. equationFromExpr
+ // uses it to find where the comparison in an alert expression starts.
+ atComparison = regexp.MustCompile("[<>=!]+")
+)
+
+// Reverse returns s with its runes in the opposite order.
+//
+// See https://github.com/golang/example/blob/master/stringutil/reverse.go
+func Reverse(s string) string {
+	src := []rune(s)
+	last := len(src) - 1
+	dst := make([]rune, len(src))
+	// Fill dst back to front so the first rune of s ends up last.
+	for i, r := range src {
+		dst[last-i] = r
+	}
+	return string(dst)
+}
+
+// equationFromExpr returns the part of a Prometheus alert expression that
+// precedes its last comparison operator, with surrounding whitespace removed.
+// For example "up == 0" yields "up".
+//
+// It returns "" when expr is empty or contains none of the characters
+// matched by atComparison, so that callers can report the offending rule
+// instead of the process panicking on an out-of-range index.
+//
+// NOTE(review): the split is purely textual. An expression with no trailing
+// comparison but with a label matcher (e.g. `up{job="x"}`) is split at the
+// '=' inside the braces; the input is presumed to follow the
+// "equation <op> constant" form described in the package comment.
+func equationFromExpr(expr string) string {
+	// Split the reversed string so that the first match found is the last
+	// comparison operator in expr; parts[1] is then everything before it,
+	// still reversed.
+	parts := atComparison.Split(Reverse(expr), 2)
+	if len(parts) < 2 {
+		return ""
+	}
+	return strings.TrimSpace(Reverse(parts[1]))
+}
+
+// main reads the Prometheus alert rules in the file named by --input and
+// writes one "Absent" alert per input rule to the file named by --output.
+//
+// Each generated rule stays in a group with the same name as its source
+// group, wraps the source rule's equation (see equationFromExpr) in absent(),
+// is labelled category=infra / severity=critical, and records the source
+// alert's name in the "abbr" and "description" annotations. Any failure to
+// read, parse, convert or write is fatal.
+func main() {
+	// NOTE(review): common.Init is defined outside this file; it is presumed
+	// to parse the command-line flags before they are dereferenced below.
+	common.Init()
+	b, err := ioutil.ReadFile(*input)
+	if err != nil {
+		sklog.Fatal(err)
+	}
+	var alerts Alerts
+	if err := yaml.Unmarshal(b, &alerts); err != nil {
+		sklog.Fatal(err)
+	}
+
+	absent := Alerts{
+		Groups: make([]Group, 0, len(alerts.Groups)),
+	}
+
+	for _, g := range alerts.Groups {
+		rules := make([]Rule, 0, len(g.Rules))
+		for _, rule := range g.Rules {
+			equation := equationFromExpr(rule.Expr)
+			if equation == "" {
+				sklog.Fatalf("Failed to extract an equation for %q", rule.Alert)
+			}
+			rules = append(rules, Rule{
+				Alert: "Absent",
+				Expr:  fmt.Sprintf("absent(%s)", equation),
+				Labels: map[string]string{
+					"category": "infra",
+					// Must be spelled "severity" to match the label used
+					// by the hand-written alert rules.
+					"severity": "critical",
+				},
+				Annotations: map[string]string{
+					"abbr":        rule.Alert,
+					"description": fmt.Sprintf("There is no data for the Alert: %q", rule.Alert),
+				},
+			})
+		}
+		absent.Groups = append(absent.Groups, Group{
+			Name:  g.Name,
+			Rules: rules,
+		})
+	}
+
+	b, err = yaml.Marshal(absent)
+	if err != nil {
+		sklog.Fatal(err)
+	}
+	if err := ioutil.WriteFile(*output, b, 0664); err != nil {
+		sklog.Fatal(err)
+	}
+}
diff --git a/promk/go/promk-absent/main_test.go b/promk/go/promk-absent/main_test.go
new file mode 100644
index 0000000..c8a16ef
--- /dev/null
+++ b/promk/go/promk-absent/main_test.go
@@ -0,0 +1,49 @@
+package main
+
+import (
+ "testing"
+
+ "go.skia.org/infra/go/testutils"
+)
+
+// TestEquationFromExpr checks that equationFromExpr strips the trailing
+// comparison operator and constant from a range of alert expressions, and
+// that an empty expression yields an empty equation.
+func TestEquationFromExpr(t *testing.T) {
+	testutils.SmallTest(t)
+
+	type testCase struct {
+		expr string // expression handed to equationFromExpr
+		want string // equation expected back
+		desc string // what the case exercises; printed on failure
+	}
+	cases := []testCase{
+		{"up == 0", "up", "=="},
+		{"liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2", "liveness_ci_pubsub_receive_s", ">"},
+		{"cq_watcher_in_flight_waiting_in_cq{app=\"cq-watcher\"} >= 10", "cq_watcher_in_flight_waiting_in_cq{app=\"cq-watcher\"}", "{app=...}"},
+		{"healthy{app=\"ct-master\"} != 1", "healthy{app=\"ct-master\"}", "!"},
+		{"", "", "empty string"},
+	}
+
+	for _, c := range cases {
+		got := equationFromExpr(c.expr)
+		if got == c.want {
+			continue
+		}
+		t.Errorf("Failed case Got %v Want %v: %s", got, c.want, c.desc)
+	}
+}
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 215a886..4166cdc 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -17,7 +17,7 @@
kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
- https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+ https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
'
- alert: CrashLoop
@@ -33,7 +33,7 @@
kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
- https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+ https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
'
- alert: TooManyGoRoutines
@@ -49,7 +49,7 @@
kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
- https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+ https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
'
- alert: TooManyOpenFDs
@@ -80,12 +80,14 @@
description: '{{ $labels.device }} on {{ $labels.instance }} in pool {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.'
- alert: AutoRollBackendErrorRate
- expr: rate(num_log_lines{level="ERROR",log_source="autoroll-be"}[1h]) > 0.001
+ expr: rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001
labels:
category: infra
severity: critical
annotations:
- description: 'The error rate for autoroll on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+ description: 'The error rate for autoroll on {{ $labels.app }} is too high.
+ https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+ https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
- alert: AutoRollLastTransition
expr: liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 10*60
@@ -93,7 +95,9 @@
category: infra
severity: critical
annotations:
- description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 10 minutes. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
+ description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 10 minutes.
+ https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+ '
- alert: HighExternalQPS
expr: sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25
@@ -111,16 +115,6 @@
annotations:
description: 'QPS to www.googleapis.com is high. Verify that this is expected.'
-# skia-flutter-autoroll takes a long time to transition because it's pre-upload
-# scripts run flutter's license script which can take around 20 minutes.
- - alert: AutoRollLastTransition
- expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 30*60
- labels:
- category: infra
- severity: critical
- annotations:
- description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 30 minutes. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
-
- alert: AutoRollGetSheriffFailed
expr: autoroll_get_sheriff_success == 0
for: 2h
@@ -128,4 +122,6 @@
category: infra
severity: critical
annotations:
- description: 'Autoroll on {{ $labels.instance }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
+ description: 'Autoroll on {{ $labels.instance }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty.
+ https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+ '
diff --git a/promk/prometheus/alerts_public.yml b/promk/prometheus/alerts_public.yml
index 7f2034b..cdd6a28 100644
--- a/promk/prometheus/alerts_public.yml
+++ b/promk/prometheus/alerts_public.yml
@@ -205,12 +205,26 @@
description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency'
- alert: AutoRollFrontendErrorRate
- expr: rate(num_log_lines{level="ERROR",log_source="autoroll-fe"}[1h]) > 0.001
+ expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001
labels:
category: infra
severity: critical
annotations:
- description: 'The error rate for autoroll on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+ description: 'The error rate for autoroll on {{ $labels.instance }} is too high.
+ https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
+ https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+
+# skia-flutter-autoroll takes a long time to transition because it's pre-upload
+# scripts run flutter's license script which can take around 20 minutes.
+ - alert: AutoRollLastTransition
+ expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 30*60
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 30 minutes.
+ https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
+ '
# Perf
- alert: AndroidIngestFailures
diff --git a/promk/prometheus/prometheus-corp.yml b/promk/prometheus/prometheus-corp.yml
index 97dbe5a..c36d427 100644
--- a/promk/prometheus/prometheus-corp.yml
+++ b/promk/prometheus/prometheus-corp.yml
@@ -1,11 +1,16 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
+ external_labels:
+ project: skia-corp
rule_files:
- "alerts_corp.yml"
- "alerts_general.yml"
- "alerts_alert_to_pubsub.yml"
+ - "absent_alerts_corp.yml"
+ - "absent_alerts_general.yml"
+ - "absent_alerts_alert_to_pubsub.yml"
alerting:
alertmanagers:
diff --git a/promk/prometheus/prometheus-public.yml b/promk/prometheus/prometheus-public.yml
index afb6d7a..ac50d47 100644
--- a/promk/prometheus/prometheus-public.yml
+++ b/promk/prometheus/prometheus-public.yml
@@ -1,11 +1,16 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
+ external_labels:
+ project: skia-public
rule_files:
- "alerts_public.yml"
- "alerts_general.yml"
- "alerts_alert_to_pubsub.yml"
+ - "absent_alerts_public.yml"
+ - "absent_alerts_general.yml"
+ - "absent_alerts_alert_to_pubsub.yml"
alerting:
alertmanagers: