[alerts] Add TooManyPodRestarts alert
This alert fires when a pod restarts more than twice in an hour. Note
that pushing a new version does not count as a restart, so this should
not cause false positives in those cases.
Bug: skia:10543
Change-Id: Ic7e34f9d6b07772a9cd3681267437be03c141ab0
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/307556
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Ravi Mistry <rmistry@google.com>
Commit-Queue: Eric Boren <borenet@google.com>
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 7f9e063..c9186c2 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -640,6 +640,15 @@
abbr: '{{ $labels.container }}'
description: 'The app {{ $labels.exported_app }} is running in {{ $labels.project }} but is not checked into {{ $labels.repo }}'
+ - alert: TooManyPodRestarts
+ expr: increase(pod_restart_count[1h]) > 2  # Restarts over the trailing hour; fires above two.
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.container }}'
+ description: 'Container {{ $labels.container }} of app {{ $labels.exported_app }} is restarting too often.'
+
# GitSync.
- alert: GitSyncStalled
expr: liveness_last_successful_git_sync_s > 5*60