Add alerts for untriggered periodic jobs. Add an alert for overdue weekly jobs. Bug: skia: Change-Id: I04752c05b239e9e5016321801db7bc8eed1f0b01 Reviewed-on: https://skia-review.googlesource.com/c/184383 Commit-Queue: Eric Boren <borenet@google.com> Reviewed-by: Ben Wagner <benjaminwagner@google.com>

commit: 0b7866fadc43f7c8b515529a0a9572b0ec581ef0 [log] [tgz]
author: Eric Boren <borenet@google.com> Wed Jan 16 12:23:31 2019 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> Wed Jan 16 17:34:26 2019 +0000
tree: 4ecf2c442b3d8477331ea401ea5267b2673c1305
parent: 88f7fbab080a7fa798ab2011dcae89d76474bee5 [diff]
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index 7d4b260..8bcf13d 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules

@@ -364,22 +364,6 @@
   }
 
 
-ALERT NightlyTrigger
-  IF liveness_periodic_trigger_s{trigger="nightly"}/60/60 > 25
-  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
-  ANNOTATIONS {
-    abbr = "{{ $labels.instance }}",
-    description = "The nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly"
-  }
-
-ALERT WeeklyTrigger
-  IF liveness_periodic_trigger_s{trigger="weekly"}/60/60/24 > 8
-  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
-  ANNOTATIONS {
-    abbr = "{{ $labels.instance }}",
-    description = "The weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly"
-  }
-
 ALERT TaskSchedulerTooManyCandidates
   IF task_candidate_count > 1500
   LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
@@ -417,8 +401,6 @@
     description = "{{ $labels.job_name }} has not finished for any commit in the last 9 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
   }
 
-# Note: We don't have an alert for job_trigger="weekly" because Task Scheduler's
-# scheduling window is only four days.
 ALERT OverdueJobSpecNightly
   IF overdue_job_specs_s{job_trigger="nightly"}/60/60 > 28
   LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
@@ -427,6 +409,30 @@
     description = "{{ $labels.job_name }} has not completed in the last 28 hours (nightly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
   }
 
+ALERT OverdueJobSpecWeekly
+  IF overdue_job_specs_s{job_trigger="weekly"}/60/60 > 7*24+4
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+  ANNOTATIONS {
+    abbr = "{{ $labels.job_name }}",
+    description = "{{ $labels.job_name }} has not completed in the last week + 4 hours (weekly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
+  }
+
+ALERT LatestJobAgeNightly
+  IF latest_job_age_s{job_trigger="nightly"}/60/60 > 25
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+  ANNOTATIONS {
+    abbr = "{{ $labels.job_name }}",
+    description = "{{ $labels.job_name }} has not been triggered in the last 25 hours (nightly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age"
+  }
+
+ALERT LatestJobAgeWeekly
+  IF latest_job_age_s{job_trigger="weekly"}/60/60 > 7*24+1
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
+  ANNOTATIONS {
+    abbr = "{{ $labels.job_name }}",
+    description = "{{ $labels.job_name }} has not been triggered in the last week + 1 hour (weekly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age"
+  }
+
 
 # Skolo
 

diff --git a/task_scheduler/PROD.md b/task_scheduler/PROD.md
index a60db53..f4ae29d 100644
--- a/task_scheduler/PROD.md
+++ b/task_scheduler/PROD.md

@@ -177,3 +177,14 @@
       search UI](https://task-scheduler.skia.org/jobs/search) can be used to
       bulk-cancel jobs.
 
+
+latest_job_age
+--------------
+
+Jobs have not been triggered recently enough for the indicated job spec. This
+normally indicates that the periodic triggers have stopped working for some
+reason. Double-check that the "periodic-trigger" cron jobs have run at the
+expected time in Kubernetes. If they have not, look into why. If they have,
+check the Task Scheduler logs to verify that the scheduler received the pubsub
+message and, if so, determine why it did not create the job.
+
commit	0b7866fadc43f7c8b515529a0a9572b0ec581ef0	[log] [tgz]
author	Eric Boren <borenet@google.com>	Wed Jan 16 12:23:31 2019 -0500
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	Wed Jan 16 17:34:26 2019 +0000
tree	4ecf2c442b3d8477331ea401ea5267b2673c1305
parent	88f7fbab080a7fa798ab2011dcae89d76474bee5 [diff]