Add alert for Swarming bot uptime
Change-Id: I0d764fbec4e6fd8dfc057601c3c9c8f55a400631
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/270950
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Commit-Queue: Eric Boren <borenet@google.com>
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 13c0350..2325a7e 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -464,6 +464,15 @@
abbr: '{{ $labels.bot }}'
description: 'Swarming bot {{ $labels.bot }} is quarantined because the device is {{ $labels.device_state }} and has not resolved itself in 1+ hours. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance'
+ - alert: BotUptime  # Fires when the expr below is true; this rule sets no `for:` delay.
+ expr: swarming_bots_uptime_s{bot!="skia-rpi-template"} / 60 / 60 > 36  # Metric divided by 60 twice, compared against 36 (presumably seconds -> hours; confirm the metric's unit). The bot named skia-rpi-template is excluded.
+ labels:
+ category: infra
+ severity: critical
+ annotations:
+ abbr: '{{ $labels.bot }}'
+ description: 'Swarming bot {{ $labels.bot }} has gone too long without a reboot. Check the events on the Swarming bot page and reboot manually if necessary. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance'
+
# Alerts for supported branches.
- alert: MissingCQConfigForSupportedBranch