[alerts] Add TooManyPodRestarts alert

This alert fires when a pod restarts more than twice in an hour. Note
that pushing a new version does not count as a restart, so this should
not cause false positives in those cases.

Bug: skia:10543
Change-Id: Ic7e34f9d6b07772a9cd3681267437be03c141ab0
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/307556
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Ravi Mistry <rmistry@google.com>
Commit-Queue: Eric Boren <borenet@google.com>
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 7f9e063..c9186c2 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -640,6 +640,15 @@
       abbr: '{{ $labels.container }}'
       description: 'The app {{ $labels.exported_app }} is running in {{ $labels.project }} but is not checked into {{ $labels.repo }}'
 
+  - alert: TooManyPodRestarts
+    expr: increase(pod_restart_count[1h]) > 2  # Fires when pod_restart_count grew by more than 2 over the trailing hour.
+    labels:
+      category: infra
+      severity: critical
+    annotations:
+      abbr: '{{ $labels.container }}'
+      description: 'Container {{ $labels.container }} of app {{ $labels.exported_app }} is restarting too often.'
+
 # GitSync.
   - alert: GitSyncStalled
     expr: liveness_last_successful_git_sync_s > 5*60