Fix and edit more alerts for k8s apps.
This is a followup to
https://skia-review.googlesource.com/c/buildbot/+/184622 to fix the
CrashLoop alert and add abbr annotations to a couple more alerts.
Change-Id: If73133e2e25b1a9800c7b8bc96336f89a667a6f8
Reviewed-on: https://skia-review.googlesource.com/c/184881
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Reviewed-by: Ravi Mistry <rmistry@google.com>
Commit-Queue: Ben Wagner <benjaminwagner@google.com>
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 5bcc616..6d8b943 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -46,22 +46,6 @@
startup. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'
- - alert: CrashLoop
- expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
- for: 5m
- labels:
- category: infra
- severity: critical
- annotations:
- abbr: '{{ $labels.instance }}'
- description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
- startup. Logs:
-
- kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
-
- https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
- '
-
- alert: TooManyGoRoutines
expr: go_goroutines{kubernetes_pod_name!=""} > 3000
for: 2m
diff --git a/promk/prometheus/alerts_public.yml b/promk/prometheus/alerts_public.yml
index 36aa549..b14bef2 100644
--- a/promk/prometheus/alerts_public.yml
+++ b/promk/prometheus/alerts_public.yml
@@ -7,6 +7,24 @@
- name: general
rules:
+ # This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
+ # for processes with this metric.
+ - alert: CrashLoop
+ expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
+ for: 5m
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.instance }}'
+ description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
+ startup. Logs:
+
+ kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
+
+ https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
+ '
+
# Container Builder
- alert: ContainerBuilderFailure
expr: ci_build_failure >= 2
@@ -121,7 +139,8 @@
severity: warning
owner: kjlubick@google.com
annotations:
- description: 'The Fuzzer hasnt rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version'
+ abbr: '{{ $labels.kubernetes_pod_name }}'
+ description: 'The Fuzzer on {{ $labels.kubernetes_pod_name }} hasnt rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version'
- alert: FuzzerSlowRoll
expr: fuzzer_version_age{type="pending"}/60/60 > 2
@@ -130,7 +149,8 @@
severity: critical
owner: kjlubick@google.com
annotations:
- description: 'The fuzzer hasnt finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll'
+ abbr: '{{ $labels.kubernetes_pod_name }}'
+ description: 'The fuzzer on {{ $labels.kubernetes_pod_name }} hasnt finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll'
- alert: FuzzerAnalysisQueueFull
expr: fuzzer_queue_size_analysis > 900000
diff --git a/promk/prometheus/prometheus-corp.yml b/promk/prometheus/prometheus-corp.yml
index dc94243..8784ead 100644
--- a/promk/prometheus/prometheus-corp.yml
+++ b/promk/prometheus/prometheus-corp.yml
@@ -17,6 +17,9 @@
- alert-to-pubsub:8000
scrape_configs:
+ # Note: If you add any new scrape_configs for processes that have a liveness_uptime_s metric,
+ # please move the CrashLoop alert in alerts_public.yml back to alerts_general.yml.
+
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs: