[thanos] Add monitoring for thanos
Change-Id: Iecf67202004f795272d03ecca6632e0bc17094d0
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/264558
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/promk/Makefile b/promk/Makefile
index 776b5c2..60b314f 100644
--- a/promk/Makefile
+++ b/promk/Makefile
@@ -5,8 +5,10 @@
promk-absent --input=prometheus/alerts_corp.yml --output=prometheus/absent_alerts_corp.yml
promk-absent --input=prometheus/alerts_general.yml --output=prometheus/absent_alerts_general.yml
promk-absent --input=prometheus/alerts_public.yml --output=prometheus/absent_alerts_public.yml
+ promk-absent --input=prometheus/alerts_thanos.yml --output=prometheus/absent_alerts_thanos.yml
-./tmp/promtool check config ./prometheus/prometheus-public.yml
-./tmp/promtool check config ./prometheus/prometheus-corp.yml
+ -./tmp/promtool check config ./prometheus/prometheus-rack4.yml
./tmp/promtool check rules ./prometheus/alerts_*.yml
./tmp/promtool check rules ./prometheus/absent_*.yml
# Please check output above for FAILED. If the only error is "FAILED: error checking bearer
@@ -40,6 +42,8 @@
-rm -rf ./tmpconf
mkdir ./tmpconf
cp prometheus/prometheus-public.yml ./tmpconf/prometheus.yml
+ cp prometheus/alerts_thanos.yml ./tmpconf/
+ cp prometheus/absent_alerts_thanos.yml ./tmpconf/
# Need to use replace so that the configmap gets updated. Change "replace"
# to "create" if this is the first time the configmap has been uploaded.
kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
diff --git a/promk/prometheus/alerts_thanos.yml b/promk/prometheus/alerts_thanos.yml
new file mode 100644
index 0000000..f4786f1
--- /dev/null
+++ b/promk/prometheus/alerts_thanos.yml
@@ -0,0 +1,33 @@
+# Alerts for Thanos, which we fire from both thanos and the skia-public
+# prometheus instance for safety.
+#
+groups:
+- name: thanos
+ rules:
+ - alert: PrometheusRuleEvaluationFailures
+ expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+ description: 'Thanos/Prometheus is failing to evaluate rules, check {{ $labels.kubernetes_pod_name }} pod logs'
+ - alert: ThanosRuleIsDroppingAlerts
+ expr: rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+ description: 'Thanos Rule is dropping alerts, check {{ $labels.kubernetes_pod_name }} pod logs'
+ - alert: ThanosRuleGrpcErrorRate
+ expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable"}[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+      description: 'Thanos Rule is returning Internal/Unavailable errors, check {{ $labels.kubernetes_pod_name }} pod logs.'
diff --git a/promk/prometheus/prometheus-public.yml b/promk/prometheus/prometheus-public.yml
index b25397f..a66fa8f 100644
--- a/promk/prometheus/prometheus-public.yml
+++ b/promk/prometheus/prometheus-public.yml
@@ -4,14 +4,27 @@
external_labels:
cluster: skia-public
+rule_files:
+ - "alerts_thanos.yml"
+ - "absent_alerts_thanos.yml"
+
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets:
+ - alert-to-pubsub:8000
+
scrape_configs:
# Scrape configs for pods that run multiple containers. If a pod doesn't
# run multiple containers then just use annotations to get it scraped.
# Otherwise add the second, third, fourth, etc. port here.
- job_name: 'auth-proxy'
static_configs:
+ - targets: ['prometheus:9000']
- targets: ['prometheus:10000']
- targets: ['collectd-exporter:10000']
+ - targets: ['thanos-query:10000']
+ - targets: ['thanos-rule:10000']
- job_name: 'kubernetes-apiservers'