[thanos] Add monitoring for thanos
Change-Id: Iecf67202004f795272d03ecca6632e0bc17094d0
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/264558
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/promk/Makefile b/promk/Makefile
index 776b5c2..60b314f 100644
--- a/promk/Makefile
+++ b/promk/Makefile
@@ -5,8 +5,10 @@
promk-absent --input=prometheus/alerts_corp.yml --output=prometheus/absent_alerts_corp.yml
promk-absent --input=prometheus/alerts_general.yml --output=prometheus/absent_alerts_general.yml
promk-absent --input=prometheus/alerts_public.yml --output=prometheus/absent_alerts_public.yml
+ promk-absent --input=prometheus/alerts_thanos.yml --output=prometheus/absent_alerts_thanos.yml
-./tmp/promtool check config ./prometheus/prometheus-public.yml
-./tmp/promtool check config ./prometheus/prometheus-corp.yml
+ -./tmp/promtool check config ./prometheus/prometheus-rack4.yml
./tmp/promtool check rules ./prometheus/alerts_*.yml
./tmp/promtool check rules ./prometheus/absent_*.yml
# Please check output above for FAILED. If the only error is "FAILED: error checking bearer
@@ -40,6 +42,8 @@
-rm -rf ./tmpconf
mkdir ./tmpconf
cp prometheus/prometheus-public.yml ./tmpconf/prometheus.yml
+ cp prometheus/alerts_thanos.yml ./tmpconf/
+ cp prometheus/absent_alerts_thanos.yml ./tmpconf/
# Need to use replace so that the configmap gets updated. Change "replace"
# to "create" if this is the first time the configmap has been uploaded.
kubectl create configmap prometheus-server-conf --from-file=./tmpconf -o yaml --dry-run | kubectl replace -f -
diff --git a/promk/prometheus/alerts_thanos.yml b/promk/prometheus/alerts_thanos.yml
new file mode 100644
index 0000000..f4786f1
--- /dev/null
+++ b/promk/prometheus/alerts_thanos.yml
@@ -0,0 +1,33 @@
+# Alerts for Thanos, which we fire from both thanos and the skia-public
+# prometheus instance for safety.
+#
+groups:
+- name: thanos
+ rules:
+ - alert: PrometheusRuleEvaluationFailures
+ expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+ description: 'Thanos/Prometheus is failing to evaluate rules, check {{ $labels.kubernetes_pod_name }} pod logs'
+ - alert: ThanosRuleIsDroppingAlerts
+ expr: rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+ description: 'Thanos Rule is dropping alerts, check {{ $labels.kubernetes_pod_name }} pod logs'
+ - alert: ThanosRuleGrpcErrorRate
+ expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable"}[5m]) > 0
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+      description: 'Thanos Rule is returning Internal/Unavailable errors, check {{ $labels.kubernetes_pod_name }} pod logs.'
diff --git a/promk/prometheus/prometheus-public.yml b/promk/prometheus/prometheus-public.yml
index b25397f..a66fa8f 100644
--- a/promk/prometheus/prometheus-public.yml
+++ b/promk/prometheus/prometheus-public.yml
@@ -4,14 +4,27 @@
external_labels:
cluster: skia-public
+rule_files:
+ - "alerts_thanos.yml"
+ - "absent_alerts_thanos.yml"
+
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets:
+ - alert-to-pubsub:8000
+
scrape_configs:
# Scrape configs for pods that run multiple containers. If a pod doesn't
# run multiple containers then just use annotations to get it scraped.
# Otherwise add the second, third, fourth, etc. port here.
- job_name: 'auth-proxy'
static_configs:
+ - targets: ['prometheus:9000']
- targets: ['prometheus:10000']
- targets: ['collectd-exporter:10000']
+ - targets: ['thanos-query:10000']
+ - targets: ['thanos-rule:10000']
- job_name: 'kubernetes-apiservers'