Add alerts for untriggered periodic jobs
Add an alert for overdue weekly jobs.
Bug: skia:
Change-Id: I04752c05b239e9e5016321801db7bc8eed1f0b01
Reviewed-on: https://skia-review.googlesource.com/c/184383
Commit-Queue: Eric Boren <borenet@google.com>
Reviewed-by: Ben Wagner <benjaminwagner@google.com>
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index 7d4b260..8bcf13d 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules
@@ -364,22 +364,6 @@
}
-ALERT NightlyTrigger
- IF liveness_periodic_trigger_s{trigger="nightly"}/60/60 > 25
- LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
- ANNOTATIONS {
- abbr = "{{ $labels.instance }}",
- description = "The nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly"
- }
-
-ALERT WeeklyTrigger
- IF liveness_periodic_trigger_s{trigger="weekly"}/60/60/24 > 8
- LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
- ANNOTATIONS {
- abbr = "{{ $labels.instance }}",
- description = "The weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly"
- }
-
ALERT TaskSchedulerTooManyCandidates
IF task_candidate_count > 1500
LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
@@ -417,8 +401,6 @@
description = "{{ $labels.job_name }} has not finished for any commit in the last 9 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
}
-# Note: We don't have an alert for job_trigger="weekly" because Task Scheduler's
-# scheduling window is only four days.
ALERT OverdueJobSpecNightly
IF overdue_job_specs_s{job_trigger="nightly"}/60/60 > 28
LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
@@ -427,6 +409,30 @@
description = "{{ $labels.job_name }} has not completed in the last 28 hours (nightly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
}
+ALERT OverdueJobSpecWeekly
+ IF overdue_job_specs_s{job_trigger="weekly"}/60/60 > 7*24+4
+ LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+ ANNOTATIONS {
+ abbr = "{{ $labels.job_name }}",
+ description = "{{ $labels.job_name }} has not completed in the last week + 4 hours (weekly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
+ }
+
+ALERT LatestJobAgeNightly
+ IF latest_job_age_s{job_trigger="nightly"}/60/60 > 25
+ LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+ ANNOTATIONS {
+ abbr = "{{ $labels.job_name }}",
+ description = "{{ $labels.job_name }} has not been triggered in the last 25 hours (nightly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age"
+ }
+
+ALERT LatestJobAgeWeekly
+ IF latest_job_age_s{job_trigger="weekly"}/60/60 > 7*24+1
+ LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+ ANNOTATIONS {
+ abbr = "{{ $labels.job_name }}",
+ description = "{{ $labels.job_name }} has not been triggered in the last week + 1 hour (weekly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age"
+ }
+
# Skolo
diff --git a/task_scheduler/PROD.md b/task_scheduler/PROD.md
index a60db53..f4ae29d 100644
--- a/task_scheduler/PROD.md
+++ b/task_scheduler/PROD.md
@@ -177,3 +177,14 @@
search UI](https://task-scheduler.skia.org/jobs/search) can be used to
bulk-cancel jobs.
+
+latest_job_age
+--------------
+
+Jobs have not been triggered recently enough for the indicated job spec. This
+normally indicates that the periodic triggers have stopped working for some
+reason. Double check that the "periodic-trigger" cron jobs have run at the
+expected time in Kubernetes. If they have not, look into why. If they have,
+check the Task Scheduler logs to verify that the scheduler received the pubsub
+message and, if so, determine why it did not create the job.
+