Convert Task Scheduler alerts to Prometheus.

BUG=skia:6249

Change-Id: I3017c5c13dbba63e62a6b2e5bfc1cf1339eebaa0
Reviewed-on: https://skia-review.googlesource.com/8613
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Eric Boren <borenet@google.com>
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
diff --git a/alertserver/alerts.cfg b/alertserver/alerts.cfg
index eef94f3..6c90cb2 100644
--- a/alertserver/alerts.cfg
+++ b/alertserver/alerts.cfg
@@ -123,129 +123,3 @@
 actions = ["Email(infra-alerts@skia.org)"]
 auto-dismiss = true
 nag = "1h"
-
-#
-# Task Scheduler
-#
-
-[[rule]]
-name = "Task Scheduler Failing (%(host)s)"
-message = "The task scheduler has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#scheduling_failed"
-database = "skmetrics"
-query = "select mean(value) from liveness where time > now() - 5m AND app='task_scheduler' and \"name\"='last-successful-task-scheduling' group by app, host"
-category = "infra"
-conditions = ["x > 10 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "1h"
-
-[[rule]]
-name = "Task Scheduler HTTP Latency"
-message = "https://task-scheduler.skia.org took more than 300ms to respond. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#http_latency"
-database = "skmetrics"
-query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='task_scheduler'"
-category = "infra"
-conditions = ["x / 1000000 > 300"]
-actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
-auto-dismiss = true
-nag = "1h"
-
-[[rule]]
-name = "Task Scheduler Error Rate (%(host)s)"
-message = "The error rate for task_scheduler on %(host)s is too high. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#error_rate"
-database = "skmetrics"
-query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='task_scheduler' AND level='ERROR' group by host order by time desc limit 1"
-category = "infra"
-conditions = ["x >= 5"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = false
-nag = "1h"
-
-[[rule]]
-name = "Task Scheduler DB backup (%(host)s)"
-message = "The last Task Scheduler DB backup was more than 25 hours ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#old_db_backup"
-database = "skmetrics"
-query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND \"name\" = 'last-db-backup' group by host"
-category = "infra"
-conditions = ["x > 25 * 60 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler extra DB backups (%(host)s)"
-message = "There are too many recent Task Scheduler DB backups. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#too_many_recent_db_backups"
-database = "skmetrics"
-query = "select mean(value) from \"recent-db-backup-count\" where time > now() - 1h AND app='task_scheduler' group by host"
-category = "infra"
-conditions = ["x > 9"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler DB backup trigger (%(host)s)"
-message = "The last time we checked for a Task Scheduler DB backup trigger file was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_backup_trigger_liveness"
-database = "skmetrics"
-query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'db-backup-maybe-backup-db' group by host"
-category = "infra"
-conditions = ["x > 10 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler incremental backup (%(host)s)"
-message = "The last time a Task Scheduler incremental backup succeeded was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_liveness"
-database = "skmetrics"
-query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup' group by host"
-category = "infra"
-conditions = ["x > 10 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler incremental backup reset (%(host)s)"
-message = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_reset"
-database = "skmetrics"
-query = "select mean(value) from \"counter\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup-reset' group by host"
-category = "infra"
-conditions = ["x > 0"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler DB excess free pages (%(host)s)"
-message = "There are a large number of free pages in the Task Scheduler DB. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_too_many_free_pages"
-database = "skmetrics"
-query = "select mean(value) from \"db\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND metric='FreePageCount' group by host"
-category = "infra"
-conditions = ["x > 150"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler Nightly Trigger (%(host)s)"
-message = "The Task Scheduler's nightly trigger has not run in over 25 hours. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_nightly"
-database = "skmetrics"
-query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='nightly' group by host"
-category = "infra"
-conditions = ["x > 25 * 60 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
-[[rule]]
-name = "Task Scheduler Weekly Trigger (%(host)s)"
-message = "The Task Scheduler's weekly trigger has not run in over 8 days. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_weekly"
-database = "skmetrics"
-query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='weekly' group by host"
-category = "infra"
-conditions = ["x > 8 * 24 * 60 * 60"]
-actions = ["Email(infra-alerts@skia.org)"]
-auto-dismiss = true
-nag = "6h"
-
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index 491747f..e462f0d1 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules
@@ -342,3 +342,96 @@
     description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See See https://mon.skia.org/dashboard/db/ingestion.",
   }
 
+# Task Scheduler
+
+ALERT TaskSchedulerLiveness
+  IF liveness_last_successful_task_scheduling_s/60 > 10  # /60 so the threshold reads in minutes; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler Failing ({{ $labels.instance }})",
+    description = "{{ $labels.instance }} has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#scheduling_failed"
+  }
+
+ALERT TaskSchedulerLatency
+  IF prober{type="latency",probename="task_scheduler"} > 300  # NOTE(review): raw value is compared to 300 (ms per the description); the InfluxDB rule this replaces divided by 1000000 first - confirm the unit
+  FOR 10m
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler HTTP Latency",
+    description = "The endpoint for {{ $labels.probename }} took more than 300ms to respond. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#http_latency"
+  }
+
+ALERT TaskSchedulerErrorRate
+  IF rate(num_log_lines{level="ERROR",log_source="task_scheduler"}[2m]) > 0.1  # rate() is per-second, so 0.1 is roughly 6 ERROR log lines per minute averaged over the 2m window
+  FOR 2m
+  LABELS { category = "infra", severity = "critical" }
+  ANNOTATIONS {
+    summary = "Task Scheduler Error Rate ({{ $labels.instance }})",
+    description = "The error rate for task_scheduler on {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#error_rate"
+  }
+
+ALERT TaskSchedulerDBBackup
+  IF liveness_last_db_backup_s/60/60 > 25  # /60/60 so the threshold reads in hours; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler DB backup ({{ $labels.instance }})",
+    description = "The last Task Scheduler DB backup on {{ $labels.instance }} was more than 25 hours ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#old_db_backup"
+  }
+
+ALERT TaskSchedulerExtraDBBackups
+  IF recent_db_backup_count > 9  # NOTE(review): no label matcher, so every series named recent_db_backup_count is checked - confirm only the Task Scheduler exports it
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler extra DB backups ({{ $labels.instance }})",
+    description = "There are too many recent Task Scheduler DB backups for {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_recent_db_backups"
+  }
+
+ALERT TaskSchedulerDBBackupTrigger
+  IF liveness_db_backup_maybe_backup_db_s/60 > 10  # /60 so the threshold reads in minutes; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler DB backup trigger ({{ $labels.instance }})",
+    description = "The last time we checked for a Task Scheduler DB backup trigger file on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_backup_trigger_liveness"
+  }
+
+ALERT TaskSchedulerIncrementalBackup
+  IF liveness_incremental_backup_s/60 > 10  # /60 so the threshold reads in minutes; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler incremental backup ({{ $labels.instance }})",
+    description = "The last time a Task Scheduler incremental backup succeeded on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_liveness"
+  }
+
+ALERT TaskSchedulerIncrementalBackupReset
+  IF incremental_backup_reset > 0  # any value above zero fires at once (no FOR clause); NOTE(review): no label matcher - confirm only the Task Scheduler DB exports this metric
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler incremental backup reset ({{ $labels.instance }})",
+    description = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_reset"
+  }
+
+ALERT TaskSchedulerDBFreePages
+  IF bolt_db{metric="FreePageCount",database="task_scheduler_db"} > 150  # only the task_scheduler_db series is checked
+  FOR 1h  # the count must stay above 150 for a full hour before the alert fires
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler DB excess free pages ({{ $labels.instance }})",
+    description = "There are a large number of free pages in the Task Scheduler DB on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_too_many_free_pages"
+  }
+
+ALERT TaskSchedulerNightlyTrigger
+  IF liveness_task_scheduler_periodic_trigger_s{trigger="nightly"}/60/60 > 25  # /60/60 so the threshold reads in hours; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler Nightly Trigger ({{ $labels.instance }})",
+    description = "The Task Scheduler's nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly"
+  }
+
+ALERT TaskSchedulerWeeklyTrigger
+  IF liveness_task_scheduler_periodic_trigger_s{trigger="weekly"}/60/60/24 > 8  # /60/60/24 so the threshold reads in days; assumes the metric is in seconds (per its _s suffix) - TODO confirm
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    summary = "Task Scheduler Weekly Trigger ({{ $labels.instance }})",
+    description = "The Task Scheduler's weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly"
+  }
+