Prometheus absence detector for GKE instances.

  * Adds a promk-absent tool that generates Prometheus alerts that fire
when there is no data for the expression behind an existing alert.
  * Also adds a project: label to each alert, which helps generate
correct links to logs in both skia-public and skia-corp.
  * Moves some alerts between the general/public/corp files to
rationalize where each alert is defined.
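
  For example, a hypothetical input rule

      - alert: InstanceDown
        expr: up == 0

  would produce an absent-data alert roughly like:

      - alert: Absent
        expr: absent(up)
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: InstanceDown
          description: 'There is no data for the Alert: "InstanceDown"'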

Bug: skia:
Change-Id: I13b4863aefc1130c23c93f74c4d3047b136a108e
Reviewed-on: https://skia-review.googlesource.com/c/177725
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/.gitignore b/.gitignore
index 297e904..b069925 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,6 +102,7 @@
 # promk
 promk/tmp
 promk/tmpconf
+promk/prometheus/absent_*.yml
 # kube
 **/chat_config.txt
 .vscode
diff --git a/promk/Makefile b/promk/Makefile
index 24182cb..75d7f27 100644
--- a/promk/Makefile
+++ b/promk/Makefile
@@ -1,9 +1,22 @@
+
 .PHONY: validate push_config
-validate: ./tmp/promtool
+validate: ./tmp/promtool absent
+	promk-absent --input=prometheus/alerts_alert_to_pubsub.yml --output=prometheus/absent_alerts_alert_to_pubsub.yml
+	promk-absent --input=prometheus/alerts_corp.yml --output=prometheus/absent_alerts_corp.yml
+	promk-absent --input=prometheus/alerts_general.yml --output=prometheus/absent_alerts_general.yml
+	promk-absent --input=prometheus/alerts_public.yml --output=prometheus/absent_alerts_public.yml
 	# If the only error is "FAILED: error checking bearer token file..." then you should be good.
 	-./tmp/promtool check config ./prometheus/prometheus-public.yml
 	-./tmp/promtool check config ./prometheus/prometheus-corp.yml
 	./tmp/promtool check rules ./prometheus/alerts_*.yml
+	./tmp/promtool check rules ./prometheus/absent_*.yml
+
+.PHONY: absent
+absent:
+	go install ./go/promk-absent
+
+.PHONY: testgo
+testgo:
+	go test ./go/...
 
 ./tmp/promtool:
 	-mkdir -p tmp
@@ -16,6 +29,7 @@
 	mkdir ./tmpconf
 	cp prometheus/prometheus-public.yml ./tmpconf/prometheus.yml
 	cp prometheus/alerts_*.yml ./tmpconf
+	cp prometheus/absent_alerts_*.yml ./tmpconf
 	# Need to use replace so that the configmap gets updated. Change "replace"
 	# to "create" if this is the first time the configmap has been uploaded.
 	kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
@@ -27,6 +41,7 @@
 	mkdir ./tmpconf
 	cp prometheus/prometheus-corp.yml ./tmpconf/prometheus.yml
 	cp prometheus/alerts_*.yml ./tmpconf
+	cp prometheus/absent_alerts_*.yml ./tmpconf
 	kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
 
 include ../make/clusters.mk
diff --git a/promk/go/promk-absent/main.go b/promk/go/promk-absent/main.go
new file mode 100644
index 0000000..75d84d2
--- /dev/null
+++ b/promk/go/promk-absent/main.go
@@ -0,0 +1,115 @@
+// An application to create a new set of alerts from an existing set of alerts.
+//
+// The new alerts detect if no data is present for the associated alert.
+//
+// Presumes that all expressions are written in the form of:
+//
+//    equation [<>!=]+ (some constant)
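+//
+// For example: "up == 0" or "liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2".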
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"strings"
+
+	"go.skia.org/infra/go/common"
+	"go.skia.org/infra/go/sklog"
+	yaml "gopkg.in/yaml.v2"
+)
+
+// Alerts is the top-level struct for parsing the YAML format of Prometheus alerts.
+type Alerts struct {
+	Groups []Group `yaml:"groups"`
+}
+
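+// Group is a named group of alerting rules.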
+type Group struct {
+	Name  string `yaml:"name"`
+	Rules []Rule `yaml:"rules"`
+}
+
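+// Rule is a single Prometheus alerting rule.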
+type Rule struct {
+	Alert       string            `yaml:"alert"`
+	Expr        string            `yaml:"expr"`
+	Labels      map[string]string `yaml:"labels"`
+	Annotations map[string]string `yaml:"annotations"`
+}
+
+// flags
+var (
+	input  = flag.String("input", "", "Name of file to read.")
+	output = flag.String("output", "", "Name of file to write.")
+)
+
+var (
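+	// atComparison matches the comparison operator in an alert expression.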
+	atComparison = regexp.MustCompile("[<>=!]+")
+)
+
+// Reverse a string.
+//
+// https://github.com/golang/example/blob/master/stringutil/reverse.go
+func Reverse(s string) string {
+	r := []rune(s)
+	for i, j := 0, len(r)-1; i < len(r)/2; i, j = i+1, j-1 {
+		r[i], r[j] = r[j], r[i]
+	}
+	return string(r)
+}
+
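+// equationFromExpr returns the portion of expr to the left of the last
+// comparison operator, e.g. "up == 0" yields "up". It works by reversing the
+// string, splitting on the first comparison operator found, and then reversing
+// the remainder back.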
+func equationFromExpr(expr string) string {
+	parts := atComparison.Split(Reverse(expr), 2)
+	if len(parts) < 2 {
+		// No comparison operator was found, e.g. expr is empty.
+		return ""
+	}
+	return strings.TrimSpace(Reverse(parts[1]))
+}
+
+func main() {
+	common.Init()
+	b, err := ioutil.ReadFile(*input)
+	if err != nil {
+		sklog.Fatal(err)
+	}
+	var alerts Alerts
+	if err := yaml.Unmarshal(b, &alerts); err != nil {
+		sklog.Fatal(err)
+	}
+
+	absent := Alerts{
+		Groups: []Group{},
+	}
+
+	for _, g := range alerts.Groups {
+		rules := []Rule{}
+		for _, rule := range g.Rules {
+			equation := equationFromExpr(rule.Expr)
+			if equation == "" {
+				sklog.Fatalf("Failed to extract an equation for %q", rule.Alert)
+			}
+			rules = append(rules, Rule{
+				Alert: "Absent",
+				Expr:  fmt.Sprintf("absent(%s)", equation),
+				Labels: map[string]string{
+					"category": "infra",
+					"severity": "critical",
+				},
+				Annotations: map[string]string{
+					"abbr":        rule.Alert,
+					"description": fmt.Sprintf("There is no data for the Alert: %q", rule.Alert),
+				},
+			})
+		}
+		absent.Groups = append(absent.Groups, Group{
+			Name:  g.Name,
+			Rules: rules,
+		})
+	}
+
+	b, err = yaml.Marshal(absent)
+	if err != nil {
+		sklog.Fatal(err)
+	}
+	if err := ioutil.WriteFile(*output, b, 0664); err != nil {
+		sklog.Fatal(err)
+	}
+}
diff --git a/promk/go/promk-absent/main_test.go b/promk/go/promk-absent/main_test.go
new file mode 100644
index 0000000..c8a16ef
--- /dev/null
+++ b/promk/go/promk-absent/main_test.go
@@ -0,0 +1,49 @@
+package main
+
+import (
+	"testing"
+
+	"go.skia.org/infra/go/testutils"
+)
+
+func TestEquationFromExpr(t *testing.T) {
+	testutils.SmallTest(t)
+
+	testCases := []struct {
+		value    string
+		expected string
+		message  string
+	}{
+		{
+			value:    "up == 0",
+			expected: "up",
+			message:  "==",
+		},
+		{
+			value:    "liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2",
+			expected: "liveness_ci_pubsub_receive_s",
+			message:  ">",
+		},
+		{
+			value:    "cq_watcher_in_flight_waiting_in_cq{app=\"cq-watcher\"} >= 10",
+			expected: "cq_watcher_in_flight_waiting_in_cq{app=\"cq-watcher\"}",
+			message:  "{app=...}",
+		},
+		{
+			value:    "healthy{app=\"ct-master\"} != 1",
+			expected: "healthy{app=\"ct-master\"}",
+			message:  "!=",
+		},
+		{
+			value:    "",
+			expected: "",
+			message:  "empty string",
+		},
+	}
+
+	for _, tc := range testCases {
+		if got, want := equationFromExpr(tc.value), tc.expected; got != want {
+			t.Errorf("Failed case Got %v Want %v: %s", got, want, tc.message)
+		}
+	}
+}
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 215a886..4166cdc 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -17,7 +17,7 @@
 
           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
 
-          https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
           '
 
   - alert: CrashLoop
@@ -33,7 +33,7 @@
 
           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
 
-          https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
         '
 
   - alert: TooManyGoRoutines
@@ -49,7 +49,7 @@
 
           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
 
-          https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D"projects%2Fskia-public%2Flogs%2F{{ $labels.job }}"
+          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
         '
 
   - alert: TooManyOpenFDs
@@ -80,12 +80,14 @@
       description: '{{ $labels.device }} on {{ $labels.instance }} in pool {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.'
 
   - alert: AutoRollBackendErrorRate
-    expr: rate(num_log_lines{level="ERROR",log_source="autoroll-be"}[1h]) > 0.001
+    expr: rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001
     labels:
       category: infra
       severity: critical
     annotations:
-      description: 'The error rate for autoroll on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+      description: 'The error rate for autoroll on {{ $labels.app }} is too high.
+      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+      https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
 
   - alert: AutoRollLastTransition
     expr: liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 10*60
@@ -93,7 +95,9 @@
       category: infra
       severity: critical
     annotations:
-      description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 10 minutes. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
+      description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 10 minutes.
+      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+      '
 
   - alert: HighExternalQPS
     expr: sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25
@@ -111,16 +115,6 @@
     annotations:
       description: 'QPS to www.googleapis.com is high. Verify that this is expected.'
 
-# skia-flutter-autoroll takes a long time to transition because it's pre-upload
-# scripts run flutter's license script which can take around 20 minutes.
-  - alert: AutoRollLastTransition
-    expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 30*60
-    labels:
-      category: infra
-      severity: critical
-    annotations:
-      description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 30 minutes. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
-
   - alert: AutoRollGetSheriffFailed
     expr: autoroll_get_sheriff_success == 0
     for: 2h
@@ -128,4 +122,6 @@
       category: infra
       severity: critical
     annotations:
-      description: 'Autoroll on {{ $labels.instance }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fautoroll'
+      description: 'Autoroll on {{ $labels.instance }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty.
+      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
+      '
diff --git a/promk/prometheus/alerts_public.yml b/promk/prometheus/alerts_public.yml
index 7f2034b..cdd6a28 100644
--- a/promk/prometheus/alerts_public.yml
+++ b/promk/prometheus/alerts_public.yml
@@ -205,12 +205,26 @@
       description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency'
 
   - alert: AutoRollFrontendErrorRate
-    expr: rate(num_log_lines{level="ERROR",log_source="autoroll-fe"}[1h]) > 0.001
+    expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001
     labels:
       category: infra
       severity: critical
     annotations:
-      description: 'The error rate for autoroll on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+      description: 'The error rate for autoroll on {{ $labels.instance }} is too high.
+      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
+      https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'
+
+# skia-flutter-autoroll takes a long time to transition because it's pre-upload
+# scripts run flutter's license script which can take around 20 minutes.
+  - alert: AutoRollLastTransition
+    expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 30*60
+    labels:
+      category: infra
+      severity: critical
+    annotations:
+      description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 30 minutes.
+      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
+      '
 
 # Perf
   - alert: AndroidIngestFailures
diff --git a/promk/prometheus/prometheus-corp.yml b/promk/prometheus/prometheus-corp.yml
index 97dbe5a..c36d427 100644
--- a/promk/prometheus/prometheus-corp.yml
+++ b/promk/prometheus/prometheus-corp.yml
@@ -1,11 +1,16 @@
 global:
   scrape_interval: 15s
   evaluation_interval: 15s
+  external_labels:
+    project: skia-corp
 
 rule_files:
   - "alerts_corp.yml"
   - "alerts_general.yml"
   - "alerts_alert_to_pubsub.yml"
+  - "absent_alerts_corp.yml"
+  - "absent_alerts_general.yml"
+  - "absent_alerts_alert_to_pubsub.yml"
 
 alerting:
   alertmanagers:
diff --git a/promk/prometheus/prometheus-public.yml b/promk/prometheus/prometheus-public.yml
index afb6d7a..ac50d47 100644
--- a/promk/prometheus/prometheus-public.yml
+++ b/promk/prometheus/prometheus-public.yml
@@ -1,11 +1,16 @@
 global:
   scrape_interval: 15s
   evaluation_interval: 15s
+  external_labels:
+    project: skia-public
 
 rule_files:
   - "alerts_public.yml"
   - "alerts_general.yml"
   - "alerts_alert_to_pubsub.yml"
+  - "absent_alerts_public.yml"
+  - "absent_alerts_general.yml"
+  - "absent_alerts_alert_to_pubsub.yml"
 
 alerting:
   alertmanagers: