# prometheus/sys/alert.rules - buildbot - Git at Google

 # General
 # Fires when a scrape target has reported up == 0 continuously for 5 minutes.
 ALERT InstanceDown
   IF up == 0
   FOR 5m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
     abbr = "{{ $labels.instance }}",
   }

 # Warns when any series of go_goroutines stays above 3000 for 2 minutes.
 ALERT TooManyGoRoutines
   IF go_goroutines > 3000
   FOR 2m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}.",
     summary = "Too many Go routines in {{ $labels.job }}",
   }

 # Fires when a prober series with type="failure" stays above 0 for 5 minutes.
 ALERT ProbeFailure
   IF prober{type="failure"} > 0
   FOR 5m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobers.json for the endpoint URL.",
     abbr = "{{ $labels.probename }} {{ $labels.url }}",
   }

 # Warns as soon as reboot_required_i is above 0 (no FOR hold-off).
 ALERT RebootRequired
   IF reboot_required_i > 0
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Instance {{ $labels.host }} needs rebooting. Owner(s): {{ $labels.owners }}. See https://mon.skia.org/dashboard/db/reboots-required for the full list of instances that need rebooting.",
     abbr = "{{ $labels.host }}",
   }

 # Root filesystem variant: df_complex_free for resource "df-root" below 1e9
 # for 5 minutes, skipping hosts whose name matches .*rpi-.+ .
 ALERT DiskSpaceLow
   IF df_complex_free{resource="df-root", host!~".*rpi-.+"} < 1e9
   FOR 5m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Low Root Disk Space on {{ $labels.host }}.",
     abbr = "{{ $labels.host }}",
   }

 # Persistent-disk variant: resources "df-mnt-pd0" or "df-mnt-dds0" with
 # df_complex_free below 1e10 for 5 minutes.
 ALERT DiskSpaceLow
   IF df_complex_free{resource=~"df-mnt-pd0|df-mnt-dds0"} < 1e10
   FOR 5m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Low Disk Space on {{ $labels.host }} on disk {{ $labels.resource }}.",
     abbr = "{{ $labels.host }}",
   }

 # "df-b" variant: df_complex_free below 2e10 (described as 20GB) for
 # 5 minutes, skipping hosts whose name matches .*rpi-.+ .
 ALERT DiskSpaceLow
   IF df_complex_free{resource="df-b", host!~".*rpi-.+"} < 2e10
   FOR 5m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Free space has fallen below 20GB on {{ $labels.host }} drive {{ $labels.resource }}.\n\nTry running:\ngo run scripts/run_on_swarming_bots/run_on_swarming_bots.go --alsologtostderr --script=scripts/run_on_swarming_bots/delete_out_dirs.py --dimension id:{{ $labels.host }}",
     abbr = "{{ $labels.host }}",
   }

 # skia-rpi-* variant: resources "df-var" or "df-tmp" with df_complex_free
 # below 1e8 (described as 100MB) for 5 minutes.
 ALERT DiskSpaceLow
   IF df_complex_free{resource=~"df-var|df-tmp", host=~"skia-rpi-.+"} < 1e8
   FOR 5m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}.",
     abbr = "{{ $labels.host }}",
   }

 # Warns when the minimum of dirty_packages over the trailing 25h window is
 # at least 1, i.e. the gauge never dropped below 1 in that window.
 ALERT DirtyPackages
   IF min_over_time(dirty_packages[25h]) >= 1
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "One or more dirty packages have been running for more than 24 hours. https://push.skia.org",
   }

 # Warns when the per-second rate of pulld_failed_install, taken over
 # 10 minutes, exceeds 1.
 ALERT PackageInstall
   IF rate(pulld_failed_install[10m]) > 1
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Package failing to install via pulld on {{ $labels.host }}.",
     abbr = "{{ $labels.host }}",
   }

 # Warns when process_open_fds exceeds 2000.
 # NOTE(review): the annotations read a `host` label, whereas the
 # TooManyGoRoutines rules read `instance`/`job` -- confirm that
 # process_open_fds series actually carry `host`.
 ALERT TooManyOpenFDs
   IF process_open_fds > 2000
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Too many open file handles on {{ $labels.host }}.",
     abbr = "{{ $labels.host }}",
   }

 # CT
 # Fires when the ct-poller `healthy` gauge on skia-ct-master:20000 differs
 # from 1 for 5 minutes.
 ALERT CTPollerHealthCheck
   IF healthy{instance="skia-ct-master:20000",job="ct-poller"} != 1
   FOR 5m
   LABELS { category = "infra", severity = "critical" }
   ANNOTATIONS {
     summary = "CT poller health check failed.",
     description = "CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check."
   }

 # Fires when CTFE reports 10 or more pending tasks for 5 minutes.
 ALERT CTFEPendingTaskCount
   IF num_pending_tasks{instance="skia-ctfe:20000", job="ctfe"} >= 10
   FOR 5m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks",
     summary = "CTFE pending task count too high.",
   }

 # Fires when oldest_pending_task_status for CTFE is 2 or higher for
 # 5 minutes. The meaning of the status values is defined by the exporter.
 ALERT CTFEPendingTaskStatus
   IF oldest_pending_task_status{instance="skia-ctfe:20000", job="ctfe"} >= 2
   FOR 5m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "A task has been waiting to be executed for a while and it's still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks",
     summary = "CTFE pending task not running.",
   }

 # Fires when ERROR log lines from ctfe exceed 0.1/s (2m rate) for 2 minutes.
 ALERT CTFEErrorRate
   IF rate(num_log_lines{level="ERROR", log_source="ctfe"}[2m]) > 0.1
   FOR 2m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for CTFE is too high. See cloud logging for skia-ctfe.",
     summary = "CTFE error rate too high.",
   }

 # CQ Watcher
 # Warns when the CQ watcher reports 10 or more CLs waiting in the commit
 # queue for 5 minutes.
 ALERT CQWatcherCLsCount
   IF cq_watcher_in_flight_waiting_in_cq{instance="skia-cq-watcher:20000",job="cq_watcher"} >= 10
   FOR 5m
   LABELS { category = "infra", severity = "warning" }
   ANNOTATIONS {
     summary = "Too many CLs in CQ.",
     description = "There are 10 CLs or more in Skia's CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls"
   }

 # Warns when the 20-minute maximum of the in-flight trybot duration exceeds
 # 2700 (the description equates this to 45 minutes).
 ALERT CQWatcherTrybotDuration
   IF max_over_time(cq_watcher_in_flight_trybot_duration{instance="skia-cq-watcher:20000", job="cq_watcher"}[20m]) > 2700
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "{{ $labels.trybot }} ran longer than 45 mins. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold",
     summary = "CQ trybot running for too long.",
   }

 # Warns when the 20-minute maximum of in-flight trybots exceeds 35.
 ALERT CQWatcherTrybotsCount
   IF max_over_time(cq_watcher_in_flight_trybot_num{instance="skia-cq-watcher:20000", job="cq_watcher"}[20m]) > 35
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "There are more than 35 CQ trybots triggered by at least one CL. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered",
     summary = "Too many CQ trybots triggered by CL.",
   }

 # Perf
 # Warns when skia-perf reports any untriaged clusters.
 ALERT PerfUntriagedClusters
   IF perf_clustering_untriaged{instance="skia-perf:20000"} > 0
   LABELS { severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "At least one untriaged perf cluster has been found. Please visit https://perf.skia.org/t/ to triage.",
     summary = "One or more untriaged clusters.",
   }

 # Warns when skia-android-perf reports any untriaged clusters; carries a
 # specialroute="android" label.
 # The instance pattern has no regex metacharacters and regex matchers are
 # fully anchored, so an equality matcher selects the same series.
 # NOTE(review): the description reads a `subdomain` label -- confirm the
 # metric exports it.
 ALERT AndroidPerfUntriagedClusters
   IF perf_clustering_untriaged{instance="skia-android-perf:20000"} > 0
   LABELS { specialroute = "android", severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "At least one untriaged perf cluster has been found. Please visit https://{{ $labels.subdomain }}.skia.org/t/ to triage.",
     summary = "One or more untriaged clusters.",
   }

 # Warns when skia-android-master-perf reports any untriaged clusters;
 # carries a specialroute="android-master" label.
 # The instance pattern has no regex metacharacters and regex matchers are
 # fully anchored, so an equality matcher selects the same series.
 # NOTE(review): the description reads a `subdomain` label -- confirm the
 # metric exports it.
 ALERT AndroidMasterPerfUntriagedClusters
   IF perf_clustering_untriaged{instance="skia-android-master-perf:20000"} > 0
   LABELS { specialroute = "android-master", severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "At least one untriaged perf cluster has been found. Please visit https://{{ $labels.subdomain }}.skia.org/t/ to triage.",
     summary = "One or more untriaged clusters.",
   }

 # Fires when the 2-minute rate of process_failures exceeds 0.1/s.
 ALERT AndroidIngestFailures
   IF rate(process_failures[2m]) > 0.1
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures.",
   }

 # Fires when the 5-minute rate of ingestion{metric="errors"} exceeds 1/s.
 ALERT PerfIngestErrorTooHigh
   IF rate(ingestion{metric="errors"}[5m]) > 1
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Perf ingestion error rate too high. See https://prom.skia.org/graph?g0.range_input=1h&g0.expr=rate(ingestion%7Bmetric%3D%22errors%22%7D%5B5m%5D)&g0.tab=0",
   }

 # For fiddle and debugger.

 # Warns when builds_failed reaches 2 or more; the job label selects the
 # PROD.md that the description links to.
 ALERT BuildsFailed
   IF builds_failed >= 2
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Build for {{ $labels.job }} has failed for the last 2 chrome DEPS rolls. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#build_fail",
     abbr = "{{ $labels.job }}",
   }

 # Warns when repo_sync_failed reaches 2 or more; the job label selects the
 # PROD.md that the description links to.
 ALERT SyncFailed
   IF repo_sync_failed >= 2
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Sync for {{ $labels.job }} has failed 2 times in a row. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#sync_fail",
     abbr = "{{ $labels.job }}",
   }

 # Warns when named_failures is above 0.
 ALERT NamedFiddles
   IF named_failures > 0
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "See https://fiddle.skia.org/f/ and https://skia.googlesource.com/buildbot/%2B/master/fiddle/PROD.md#named_fail",
   }

 # datahopper_internal

 # Warns when the oldest untested commit age, divided by 60 twice, exceeds 3
 # (the description equates this to three hours).
 ALERT Google3AutorollStalled
   IF datahopper_internal_ingest_build_webhook_oldest_untested_commit_age / 60 / 60 > 3
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Commit has not been picked up by Google3-Autoroller for over three hours. https://sites.google.com/a/google.com/skia-infrastructure/docs/google3-autoroller.",
   }

 # Warns when the updateWebhookMetrics liveness value divided by 60 exceeds
 # 10 (presumably seconds converted to minutes, going by the _s suffix).
 ALERT DatahopperInternalUpdateWebhookMetrics
   IF liveness_ingest_build_webhook_oldest_untested_commit_age_metric_s / 60 > 10
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "datahopper_internal goroutine for updateWebhookMetrics is dead or failing.",
   }

 # Tighter goroutine limit for datahopper_internal on skia-internal:20000:
 # warns above 100 goroutines for 2 minutes.
 ALERT TooManyGoRoutines
   IF go_goroutines{instance="skia-internal:20000", job="datahopper_internal"} > 100
   FOR 2m
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}.",
   }

 # Datahopper

 # Fires when ERROR log lines from datahopper exceed 5/s (10m rate).
 ALERT DatahopperErrorRate
   IF rate(num_log_lines{level="ERROR", log_source="datahopper"}[10m]) > 5
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-datahopper2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fdatahopper",
   }

 # Swarming

 # Fires for any non-ct-gce bot whose swarming_bots_last_seen value, divided
 # by 1000^3 and then 60, exceeds 15 (presumably nanoseconds to minutes).
 ALERT BotMissing
   IF swarming_bots_last_seen{bot!~"ct-gce-.*"} / 1000 / 1000 / 1000 / 60 > 15
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Swarming bot {{ $labels.bot }} is missing. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance",
     abbr = "{{ $labels.bot }}",
   }

 # Fires when the largest last-seen age among ct-gce-* bots exceeds 15,
 # multiplied by max(ct_gce_bots_up) so a zero there keeps the alert quiet.
 # The age is scaled by 1000^3 and then 60, the same conversion the other
 # swarming_bots_* rules use (presumably nanoseconds to minutes). The
 # previous 1024^3 divisor made the real threshold about 16.1 rather than 15.
 ALERT CtGceBotMissing
   IF max(swarming_bots_last_seen{bot=~"ct-gce-.*"})/1000/1000/1000/60 * max(ct_gce_bots_up) > 15
   LABELS { category = "infra", severity = "critical"}
   ANNOTATIONS {
     description = "1 or more CT GCE bots are down: https://chrome-swarming.appspot.com/botlist?f=status%3Adead&f=gpu%3Anone&f=pool%3ACT&l=100"
   }

 # Fires for bots in pools matching Skia.* whose swarming_bots_last_task
 # value, divided by 1000^3 and then 60 twice, is 72 or more (the
 # description equates this to 72 hours).
 ALERT BotUnemployed
   IF swarming_bots_last_task{pool=~"Skia.*"}/1000/1000/1000/60/60 >= 72
   LABELS { category = "infra", severity = "critical"}
   ANNOTATIONS {
     abbr = "{{ $labels.bot }}",
     description = "Swarming bot {{ $labels.bot }} hasn't run a job in 72 hours. Maybe its dimensions need changing? https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance"
   }

 # Fires when the 10-minute average of swarming_bots_quarantined is at
 # least 1.
 ALERT BotQuarantined
   IF avg_over_time(swarming_bots_quarantined[10m]) >= 1
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Swarming bot {{ $labels.bot }} is quarantined. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance",
     abbr = "{{ $labels.bot }}",
   }
 # Swarming Logger

 # Fires when ERROR log lines from swarming_logger exceed 1/s (10m rate).
 ALERT SwarmingLoggerErrorRate
   IF rate(num_log_lines{level="ERROR", log_source="swarming_logger"}[10m]) > 1
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for swarming_logger is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-swarming-logger2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fswarming_logger",
   }

 # Fuzzer

 # Fires when fuzzer_queue_size_upload stays above 90 for 2 minutes.
 ALERT FuzzerUploadQueue
   IF fuzzer_queue_size_upload > 90
   FOR 2m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Fuzzer upload queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload",
   }

 # Fires when fuzzer_queue_size_analysis stays above 9000 for 2 minutes.
 ALERT FuzzerAnalysisQueue
   IF fuzzer_queue_size_analysis > 9000
   FOR 2m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Fuzzer analysis queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis",
   }

 # Fires when the age of the "current" fuzzer version, divided by 60*60*24,
 # exceeds 10 (the description equates this to 10 days).
 ALERT FuzzerStaleVersion
   IF fuzzer_version_age{type="current"} / 60 / 60 / 24 > 10
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The Fuzzer hasn't rolled its version forward in 10 days.  Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version",
   }

 # Fires when the age of the "pending" fuzzer version, divided by 60*60,
 # exceeds 2 (the description equates this to 2 hours).
 ALERT FuzzerSlowRoll
   IF fuzzer_version_age{type="pending"} / 60 / 60 > 2
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The fuzzer hasn't finished rolling its version forward in 2 hours.  Something might be wrong.  https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll",
   }

 # Status

 # Fires when the 10-minute average latency of the skiastatus_json probe
 # exceeds 10 seconds. The latency rules for autoroll and task_scheduler
 # treat prober{type="latency"} as milliseconds, so the conversion to
 # seconds divides by 1000; the previous 1024 divisor put the real
 # threshold at 10.24s.
 ALERT StatusLatency
   IF avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1000  > 10
   LABELS { category = "infra", severity = "critical"}
   ANNOTATIONS {
     description = "The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond."
   }

 # Prober

 # Fires when liveness_probes_s divided by 60 exceeds 10 (the description
 # equates this to 10 minutes since probing last ran).
 ALERT ProberLiveness
   IF liveness_probes_s / 60 > 10
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Probing has failed to run in at least 10 minutes.",
   }

 # Fires when liveness_issue_tracker_s divided by 60 exceeds 30 (the
 # description equates this to 30 minutes since ingestion last ran).
 ALERT IssueTrackerLiveness
   IF liveness_issue_tracker_s / 60 > 30
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Issue tracker ingestion has failed to run in at least 30 minutes.",
   }

 # Autoroller

 # Warns when autoroll_last_roll_result for src/third_party/skia has been 0
 # for 10 minutes (the description treats 0 as a failed roll attempt).
 ALERT SkiaAutoRoll
   IF autoroll_last_roll_result{child_path="src/third_party/skia"} == 0
   FOR 10m
   LABELS { severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "The last DEPS roll attempt for Skia failed. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#autoroll_failed.",
   }

 # Warns when the last-landed-roll liveness for src/third_party/skia,
 # divided by 60*60, exceeds 24 (the description equates this to 24 hours).
 ALERT SkiaAutoRoll24H
   IF liveness_last_autoroll_landed_s{child_path="src/third_party/skia"} / 60 / 60 > 24
   LABELS { severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "The last-landed AutoRoll for Skia was over 24h ago. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#no_rolls_24h.",
   }

 # Fires when the autoroll latency probe stays above 200 (described as
 # 200ms) for 10 minutes.
 ALERT AutoRollLatency
   IF prober{probename="autoroll", type="latency"} > 200
   FOR 10m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency.",
   }

 # Fires when ERROR log lines from autoroll exceed 0.001/s (10m rate).
 ALERT AutoRollErrorRate
   IF rate(num_log_lines{level="ERROR", log_source="autoroll"}[10m]) > 0.001
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for autoroll on {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate.",
   }

 # Gold

 # Warns when skia-gold-prod reports any untriaged items in
 # gold_status_by_corpus.
 ALERT GoldUntriaged
   IF gold_status_by_corpus{instance="skia-gold-prod:20001", type="untriaged"} > 0
   LABELS { severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage."
   }

 # Warns when skia-gold-prod reports any expired ignore rules.
 ALERT GoldExpiredIgnores
   IF gold_num_expired_ignore_rules{instance="skia-gold-prod:20001"} > 0
   LABELS { severity = "warning", category = "general" }
   ANNOTATIONS {
     description = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend."
   }

 # Fires when the expired-ignore-rule monitoring liveness on skia-gold-prod
 # exceeds 200 (presumably seconds, going by the _s suffix).
 ALERT GoldIgnoreMonitoring
   IF liveness_gold_expired_ignore_rules_monitoring_s{instance="skia-gold-prod:20001"} > 200
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "At least two rounds of monitoring for expired ignore rules have failed back to back."
   }

 # Fires when ERROR log lines from job "gold" exceed 1/s (2m rate).
 ALERT GoldErrorRate
   IF rate(num_log_lines{job="gold", level="ERROR"}[2m]) > 1
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for Gold {{ $labels.instance }} is too high."
   }

 # Fires when the "since-last-run" liveness of the "poll" ingestion source
 # exceeds 750 (presumably seconds, going by the _s suffix).
 ALERT GoldIngestionStalled
   IF liveness_gold_s{source="poll", metric="since-last-run"} > 750
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/ingestion."
   }

 # Fires when ERROR log lines from job "ingestion" exceed 1/s (2m rate).
 # NOTE(review): the description reads a `corpus` label -- confirm that
 # num_log_lines series carry it.
 ALERT GoldIngestionErrorRate
   IF rate(num_log_lines{level="ERROR",job="ingestion"}[2m]) > 1
   LABELS { category = "infra", severity = "critical"}
   ANNOTATIONS {
     description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See https://mon.skia.org/dashboard/db/ingestion.",
   }

 # Task Scheduler

 # Fires when the last-successful-scheduling liveness divided by 60 exceeds
 # 10 (the description equates this to 10 minutes).
 ALERT TaskSchedulerLiveness
   IF liveness_last_successful_task_scheduling_s / 60 > 10
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "{{ $labels.instance }} has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#scheduling_failed",
     summary = "Task Scheduler Failing ({{ $labels.instance }})",
   }

 # Fires when the task_scheduler latency probe stays above 300 (described
 # as 300ms) for 10 minutes.
 ALERT TaskSchedulerLatency
   IF prober{probename="task_scheduler", type="latency"} > 300
   FOR 10m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The endpoint for {{ $labels.probename }} took more than 300ms to respond. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#http_latency",
     summary = "Task Scheduler HTTP Latency",
   }

 # Fires when ERROR log lines from task_scheduler exceed 0.05/s (2m rate)
 # for 2 minutes.
 ALERT TaskSchedulerErrorRate
   IF rate(num_log_lines{level="ERROR", log_source="task_scheduler"}[2m]) > 0.05
   FOR 2m
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The error rate for task_scheduler on {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#error_rate",
     summary = "Task Scheduler Error Rate ({{ $labels.instance }})",
   }

 # Fires when liveness_last_db_backup_s divided by 60*60 exceeds 25 (the
 # description equates this to 25 hours).
 ALERT TaskSchedulerDBBackup
   IF liveness_last_db_backup_s / 60 / 60 > 25
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The last Task Scheduler DB backup on {{ $labels.instance }} was more than 25 hours ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#old_db_backup",
     summary = "Task Scheduler DB backup ({{ $labels.instance }})",
   }

 # Fires when recent_db_backup_count exceeds 9.
 ALERT TaskSchedulerExtraDBBackups
   IF recent_db_backup_count > 9
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "There are too many recent Task Scheduler DB backups for {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_recent_db_backups",
     summary = "Task Scheduler extra DB backups ({{ $labels.instance }})",
   }

 # Fires when the backup-trigger-check liveness divided by 60 exceeds 10
 # (the description equates this to 10 minutes).
 ALERT TaskSchedulerDBBackupTrigger
   IF liveness_db_backup_maybe_backup_db_s / 60 > 10
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The last time we checked for a Task Scheduler DB backup trigger file on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_backup_trigger_liveness",
     summary = "Task Scheduler DB backup trigger ({{ $labels.instance }})",
   }

 # Fires when the incremental-backup liveness divided by 60 exceeds 10 (the
 # description equates this to 10 minutes).
 ALERT TaskSchedulerIncrementalBackup
   IF liveness_incremental_backup_s / 60 > 10
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The last time a Task Scheduler incremental backup succeeded on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_liveness",
     summary = "Task Scheduler incremental backup ({{ $labels.instance }})",
   }

 # Fires when incremental_backup_reset is above 0.
 ALERT TaskSchedulerIncrementalBackupReset
   IF incremental_backup_reset > 0
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_reset",
     summary = "Task Scheduler incremental backup reset ({{ $labels.instance }})",
   }

 # Fires when the FreePageCount of task_scheduler_db stays above 150 for
 # 1 hour, on every instance except skia-task-scheduler-internal:20000.
 ALERT TaskSchedulerDBFreePages
   IF bolt_db{database="task_scheduler_db", metric="FreePageCount", instance!="skia-task-scheduler-internal:20000"} > 150
   FOR 1h
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "There are a large number of free pages in the Task Scheduler DB on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_too_many_free_pages",
     summary = "Task Scheduler DB excess free pages ({{ $labels.instance }})",
   }

 # Fires when the "nightly" periodic-trigger liveness divided by 60*60
 # exceeds 25 (the description equates this to 25 hours).
 ALERT TaskSchedulerNightlyTrigger
   IF liveness_task_scheduler_periodic_trigger_s{trigger="nightly"} / 60 / 60 > 25
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The Task Scheduler's nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly",
     summary = "Task Scheduler Nightly Trigger ({{ $labels.instance }})",
   }

 # Fires when the "weekly" periodic-trigger liveness divided by 60*60*24
 # exceeds 8 (the description equates this to 8 days).
 ALERT TaskSchedulerWeeklyTrigger
   IF liveness_task_scheduler_periodic_trigger_s{trigger="weekly"} / 60 / 60 / 24 > 8
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The Task Scheduler's weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly",
     summary = "Task Scheduler Weekly Trigger ({{ $labels.instance }})",
   }

 # Skolo

 # Fires when liveness_skolo_last_backup_s divided by 60*60 exceeds 25 (the
 # description equates this to 25 hours). The empty {} selector matches
 # every series of the metric.
 ALERT BackupNotDone
   IF liveness_skolo_last_backup_s{} / 60 / 60 > 25
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "The automated backup of {{ $labels.backup }} was not completed by {{ $labels.federated }} in the last 25 hours. http://go/skolo-maintenance",
     abbr = "{{ $labels.federated }} {{ $labels.backup }}",
   }

 # Fires when the 10-minute maximum of skolo_hotspare_consecutive_failures
 # exceeds 600.
 # NOTE(review): the description speaks of "10 minutes"; this only matches
 # the 600 threshold if the counter advances about once per second -- confirm
 # against the exporter.
 ALERT RpiMaster
   IF max_over_time(skolo_hotspare_consecutive_failures[10m]) > 600
   LABELS { severity = "critical", category = "infra" }
   ANNOTATIONS {
     description = "skia-rpi-master-spare has been active for more than 10 minutes. Something is probably wrong with skia-rpi-master. http://go/skolo-maintenance."
   }

 #
 # QPS to external services.
 #

 # General.
 # TODO(borenet): Specific alerts for Swarming.
 # Warns when the 30-minute request rate, summed per destination host over
 # every job except ct-poller, exceeds 8/s.
 ALERT HighExternalQPS
   IF sum(rate(http_request_metrics{job!="ct-poller"}[30m])) by (host) > 8
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "QPS to {{ $labels.host }} is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }

 # CTPoller has a higher threshold than the above alert due to the autoscaler.
 # Warns when the 30-minute request rate from ct-poller, summed per
 # destination host, exceeds 10/s.
 ALERT HighExternalQPSCTPoller
   IF sum(rate(http_request_metrics{job="ct-poller"}[30m])) by (host) > 10
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "QPS to {{ $labels.host }} from the ct-poller is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }


 # Gerrit
 # Warns when any single series' 5-minute request rate to
 # skia-review.googlesource.com exceeds 5/s (the rate is not summed).
 ALERT HighQPSGerritSkiaShortTerm
   IF rate(http_request_metrics{host="skia-review.googlesource.com"}[5m]) > 5
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Short-term QPS to Gerrit (Skia) is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }

 # Warns when any single series' 5-minute request rate to
 # chromium-review.googlesource.com exceeds 5/s (the rate is not summed).
 ALERT HighQPSGerritChromeShortTerm
   IF rate(http_request_metrics{host="chromium-review.googlesource.com"}[5m]) > 5
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Short-term QPS to Gerrit (Chromium) is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }

 # Warns when any single series' 12-hour request rate to
 # skia-review.googlesource.com exceeds 1/s (the rate is not summed).
 ALERT HighQPSGerritSkiaLongTerm
   IF rate(http_request_metrics{host="skia-review.googlesource.com"}[12h]) > 1
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Long-term QPS to Gerrit (Skia) is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }

 # Warns when any single series' 12-hour request rate to
 # chromium-review.googlesource.com exceeds 1/s (the rate is not summed).
 ALERT HighQPSGerritChromeLongTerm
   IF rate(http_request_metrics{host="chromium-review.googlesource.com"}[12h]) > 1
   LABELS { severity = "warning", category = "infra" }
   ANNOTATIONS {
     description = "Long-term QPS to Gerrit (Chromium) is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps",
   }