# promk/prometheus/alerts_general.yml - buildbot - Git at Google

 # Alerts we need no matter what is running in the cluster.
 groups:
 - name: general
   rules:

   # General -- each of these alerts has two forms:
   # - Scraped by annotation: these have a kubernetes_pod_name label
   # - Scraped by config: these have instance and job labels.
   # Annotation-scraped targets (non-empty kubernetes_pod_name label):
   # fires once `up` has been 0 for 5 minutes.
   - alert: InstanceDown
     expr: up{kubernetes_pod_name!=""} == 0
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.kubernetes_pod_name }}'
       description: >-
         Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} has been down
         for more than 5 minutes.
         Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22

   # Config-scraped targets (empty kubernetes_pod_name label, identified by
   # instance/job): fires once `up` has been 0 for 5 minutes.
   # The double quotes around the logName filter value are percent-encoded
   # (%22) so the log-viewer link stays a single valid URL.
   - alert: InstanceDown
     expr: up{kubernetes_pod_name=""} == 0
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.instance }}'
       description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
         for more than 5 minutes. Logs:

           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

           https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
           '

   # k8s_pod_status for status="ContainerCreating" exceeds 30 * 60.
   - alert: PodContainerCreatingTooLong
     expr: k8s_pod_status{status="ContainerCreating"} > 30 * 60
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.pod }}'
       description: >-
         {{ $labels.pod }} of container {{ $labels.container }}
         in project {{ $labels.project }}
         has been in {{ $labels.status }} state for more than 30 minutes.

   # k8s_pod_status for status="Terminating" exceeds 30 * 60.
   - alert: PodTerminatingTooLong
     expr: k8s_pod_status{status="Terminating"} > 30 * 60
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.pod }}'
       description: >-
         {{ $labels.pod }} of container {{ $labels.container }}
         in project {{ $labels.project }}
         has been in {{ $labels.status }} state for more than 30 minutes.

   # The highest liveness_uptime_s seen in any 6m window stays below 60 * 3
   # for 5 minutes (pods with a kubernetes_pod_name label only).
   - alert: CrashLoop
     expr: max_over_time(liveness_uptime_s{kubernetes_pod_name!=""}[6m]) < 60 * 3
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.kubernetes_pod_name }}'
       description: >-
         Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} is crashing on startup.
         Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22

   # go_goroutines above 3000 for 2 minutes on any series with a non-empty
   # app label.
   - alert: TooManyGoRoutines
     expr: go_goroutines{app=~".+"} > 3000
     for: 2m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         Too many Go routines in {{ $labels.kubernetes_pod_name }}
         for app {{ $labels.app }}.
         Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22

   # process_open_fds above 5000 on any series with a non-empty app label.
   - alert: TooManyOpenFDs
     expr: process_open_fds{app=~".+"} > 5000
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         Too many open file handles on {{ $labels.kubernetes_pod_name }}
         for app {{ $labels.app }}.
         Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22

   # Used/capacity ratio reported by kubelet volume stats is above 0.9.
   - alert: PersistentVolumeLowSpace
     expr: (kubelet_volume_stats_used_bytes /kubelet_volume_stats_capacity_bytes) > 0.9
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.persistentvolumeclaim }}'
       description: >-
         {{ $labels.persistentvolumeclaim }} is more than 90% full.

   # container_fs_usage_bytes / container_fs_limit_bytes is above 0.9.
   - alert: ContainerVolumeLowSpace
     expr: (container_fs_usage_bytes/container_fs_limit_bytes) > 0.9
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.instance }}'
       description: >-
         {{ $labels.device }} on {{ $labels.instance }}
         in pool {{ $labels.cloud_google_com_gke_nodepool }}
         is more than 90% full.

   # Per-second rate of ERROR log lines from apps matching autoroll-be.*,
   # computed over 1h, is above 0.001.
   - alert: AutoRollBackendErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         The error rate for autoroll on {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
         https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate

   # Per-second rate of ERROR log lines from apps matching autoroll-fe.*,
   # computed over 1h, is above 0.001.
   # The logName path uses the series' project label (instead of a hard-coded
   # project) so it always agrees with the project= parameter of the same URL,
   # matching AutoRollBackendErrorRate.
   - alert: AutoRollFrontendErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         The error rate for autoroll on {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
         https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate

   # liveness_last_successful_autoroll_tick_s exceeds 20*60 for any roller
   # other than skia-flutter-autoroll.
   - alert: AutoRollLastTransition
     expr: liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 20*60
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.roller }}'
       description: >-
         Autoroll on {{ $labels.app }} has failed to transition for more than 20 minutes.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # Per-host sum of the 30m request rate is above 25, for every host except
   # www.googleapis.com.
   - alert: HighExternalQPS
     expr: sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.host }}'
       description: >-
         QPS to {{ $labels.host }} is high.
         Verify that this is expected.

   # Total 30m request rate to www.googleapis.com is above 60. The sum has no
   # grouping labels, so no abbr annotation is set.
   - alert: HighExternalQPSGoogleAPIs
     expr: sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60
     labels:
       category: infra
       severity: warning
     annotations:
       description: >-
         QPS to www.googleapis.com is high.
         Verify that this is expected.

   # Per-app sum of the 24h rate of firestore_ops_count{count="rows"} is above
   # 25, excluding apps matching gold.+.
   - alert: HighFirestoreUsageSustained
     expr: sum(rate(firestore_ops_count{count="rows", app!~"gold.+"}[24h])) by (app) > 25
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         Firestore usage from {{ $labels.app }} over the last 24h is high.
         Verify that this is expected, and adjust the alert threshold if it is.
         Dashboard: https://grafana2.skia.org/d/H4cyODhZz/firestore-read-write-rate?orgId=1&from=now-24h&to=now

   # Same 24h per-app Firestore row rate, but only for apps matching gold.+
   # and with a threshold of 60.
   - alert: HighFirestoreUsageSustainedGold
     expr: sum(rate(firestore_ops_count{count="rows", app=~"gold.+"}[24h])) by (app) > 60
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         Firestore usage from {{ $labels.app }} over the last 24h is high.
         Verify that this is expected, and adjust the alert threshold if it is.
         Dashboard: https://grafana2.skia.org/d/H4cyODhZz/firestore-read-write-rate?orgId=1&from=now-24h&to=now

   # Per-app sum of the 30m rate of firestore_ops_count{count="rows"} is above
   # 1000 (all apps).
   - alert: HighFirestoreUsageBurst
     expr: sum(rate(firestore_ops_count{count="rows"}[30m])) by (app) > 1000
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         Firestore usage from {{ $labels.app }} over the last 30m is high.
         Verify that this is expected, and adjust the alert threshold if it is.
         Dashboard: https://grafana2.skia.org/d/H4cyODhZz/firestore-read-write-rate?orgId=1&from=now-30m&to=now

   # autoroll_get_sheriff_success has been 0 for 2 hours.
   # The description stays single-quoted: its closing quote sits on its own
   # line, which folds into a trailing space that a block scalar would drop.
   - alert: AutoRollGetSheriffFailed
     expr: autoroll_get_sheriff_success == 0
     for: 2h
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.roller }}'
       description: 'Autoroll on {{ $labels.app }} has failed to obtain the current sheriff for more than 2 hours.
         Please verify that the sheriff endpoint is working and that the rotation schedule is not empty.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
         '

 # Skia Status
   # Latency prober value for skiastatus_json, divided by 1000, is above 10.
   - alert: StatusLatency
     expr: prober{type="latency",probename="skiastatus_json"}/1000 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.probename }}'
       description: >-
         The endpoint for {{ $labels.probename }} took more than 10s to respond.
         https://skia.googlesource.com/buildbot/%2B/master/status/PROD.md#http_latency
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_incremental_cache_update_s exceeds 5*60.
   - alert: StatusIncrementalCacheUpdate
     expr: liveness_last_successful_incremental_cache_update_s > 5*60
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         IncrementalCache UpdateLoop on {{ $labels.app }} has failed to update data for more than 5 minutes.
         Playbook: https://skia.googlesource.com/buildbot/%2B/master/status/PROD.md#incremental_cache_failed
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # Per-second rate of ERROR log lines from apps matching status.*, computed
   # over 2m, is above 0.05.
   - alert: StatusErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"status.*"}[2m]) > 0.05
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         The error rate for status on {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

 # Task Scheduler

   # liveness_last_successful_task_scheduling_s / 60 is above 10.
   - alert: TaskSchedulerLiveness
     expr: liveness_last_successful_task_scheduling_s/60 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to schedule for the last 10 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#scheduling_failed
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_repo_update_s / 60 is above 10.
   - alert: TaskSchedulerUpdateReposLiveness
     expr: liveness_last_successful_repo_update_s/60 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update repos and insert new jobs for the last 10 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#update_repos_failed
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_poll_buildbucket_for_new_tryjobs_s / 60 is
   # above 10.
   - alert: TaskSchedulerBuildbucketPollLiveness
     expr: liveness_last_successful_poll_buildbucket_for_new_tryjobs_s/60 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to poll Buildbucket for new tryjobs for the last 10 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#poll_buildbucket_failed
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_update_buildbucket_tryjob_state_s / 60 is
   # above 2.
   - alert: TaskSchedulerBuildbucketUpdateLiveness
     expr: liveness_last_successful_update_buildbucket_tryjob_state_s/60 > 2
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to send updates to Buildbucket for in-progress and completed tryjobs
         for the last 2 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#update_buildbucket_failed
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # Latency prober value for task_scheduler is above 300.
   - alert: TaskSchedulerLatency
     expr: prober{type="latency",probename="task_scheduler"} > 300
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.probename }}'
       description: >-
         The endpoint for {{ $labels.probename }} took more than 300ms to respond.
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#http_latency
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # Per-second rate of ERROR log lines from apps matching task-scheduler.*,
   # computed over 2m, is above 0.05.
   - alert: TaskSchedulerErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"task-scheduler.*"}[2m]) > 0.05
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         The error rate for task_scheduler on {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}


   # task_candidate_count is above 1500.
   - alert: TaskSchedulerTooManyCandidates
     expr: task_candidate_count > 1500
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         There are too many task candidates for dimensions: {{ $labels.dimensions }}
         https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_candidates

   # liveness_last_successful_overdue_metrics_update_s / 60 is above 10.
   - alert: OverdueMetricsLiveness
     expr: liveness_last_successful_overdue_metrics_update_s/60 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update overdue_job_specs_s for the last 10 minutes.
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_metrics_liveness
         Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

 # These jobs have tasks with an expiration of 4 hours, and we allow 2 attempts, so they should
 # normally finish within 8 hours.
   # overdue_job_specs_s / 60 / 60 is above 8 for jobs whose trigger is empty
   # or "master" and whose name does NOT contain Valgrind, MSAN or -x86-.
   - alert: OverdueJobSpec
     expr: overdue_job_specs_s{job_trigger=~"|master",job_name!~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 8
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not finished for any commit in the last 8 hours.
         Maybe the dimensions need changing?
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec

 # These jobs have tasks with an expiration of 9 hours, and we allow 2 attempts, so they should
 # normally finish within 18 hours.
   # overdue_job_specs_s / 60 / 60 is above 18 for jobs whose trigger is empty
   # or "master" and whose name contains Valgrind, MSAN or -x86-.
   # The description reports 18 hours so it matches the threshold in expr
   # (it previously said 9 hours).
   - alert: OverdueJobSpecLong
     expr: overdue_job_specs_s{job_trigger=~"|master",job_name=~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 18
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not finished for any commit in the last 18 hours.
         Maybe the dimensions need changing?
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec

   # overdue_job_specs_s / 60 / 60 is above 28 for jobs with
   # job_trigger="nightly".
   - alert: OverdueJobSpecNightly
     expr: overdue_job_specs_s{job_trigger="nightly"}/60/60 > 28
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not completed in the last 28 hours (nightly job).
         Maybe the dimensions need changing?
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec

   # overdue_job_specs_s / 60 / 60 is above 7*24+4 for jobs with
   # job_trigger="weekly".
   - alert: OverdueJobSpecWeekly
     expr: overdue_job_specs_s{job_trigger="weekly"}/60/60 > 7*24+4
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not completed in the last week + 4 hours (weekly job).
         Maybe the dimensions need changing?
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec

   # latest_job_age_s / 60 / 60 is above 25 for jobs with
   # job_trigger="nightly".
   - alert: LatestJobAgeNightly
     expr: latest_job_age_s{job_trigger="nightly"}/60/60 > 25
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not been triggered in the last 25 hours (nightly job).
         Double check whether the periodic triggers are running correctly
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age

   # latest_job_age_s / 60 / 60 is above 7*24+1 for jobs with
   # job_trigger="weekly".
   - alert: LatestJobAgeWeekly
     expr: latest_job_age_s{job_trigger="weekly"}/60/60 > 7*24+1
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.job_name }}'
       description: >-
         {{ $labels.job_name }} has not been triggered in the last week + 1 hour (weekly job).
         Double check whether the periodic triggers are running correctly
         (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json)
         Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#latest_job_age

 # Datahopper

   # Per-second rate of ERROR log lines from apps matching datahopper.*,
   # computed over 10m, is above 5.
   - alert: DatahopperErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"datahopper.*"}[10m]) > 5
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         The error rate for datahopper is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_job_metrics_update_s / 60 is above 30.
   - alert: JobMetricsLiveness
     expr: liveness_last_successful_job_metrics_update_s/60 > 30
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update job metrics for the last 30 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#job_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_bot_coverage_metrics_s / 60 is above 60.
   - alert: BotCoverageMetricsLiveness
     expr: liveness_last_successful_bot_coverage_metrics_s/60 > 60
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update bot coverage metrics for the last 1 hour.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#bot_coverage_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_swarming_task_metrics_s / 60 is above 60.
   - alert: SwarmingTaskMetricsLiveness
     expr: liveness_last_successful_swarming_task_metrics_s/60 > 60
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update swarming task metrics for the last 1 hour.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_task_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_event_metrics_update_s / 60 is above 30.
   - alert: EventMetricsLiveness
     expr: liveness_last_successful_event_metrics_update_s/60 > 30
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update event metrics for {{ $labels.measurement }}
         for the last 30 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#event_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_report_bot_metrics_s / 60 is above 10.
   # The description says "bot metrics" to match the metric name and the
   # swarming_bot_metrics playbook anchor (it previously said "task metrics",
   # which is what SwarmingTaskMetricsLiveness covers).
   - alert: SwarmingBotMetricsLiveness
     expr: liveness_last_successful_report_bot_metrics_s/60 > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.pool }}'
       description: >-
         {{ $labels.app }} has failed to update swarming bot metrics
         for pool {{ $labels.pool }} on {{ $labels.server }} for the last 10 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_bot_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

   # liveness_last_successful_firestore_backup_metrics_update_s / 60 is
   # above 15.
   - alert: FirestoreBackupMetricsLiveness
     expr: liveness_last_successful_firestore_backup_metrics_update_s/60 > 15
     labels:
       category: infra
       severity: critical
       owner: benjaminwagner@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: >-
         {{ $labels.app }} has failed to update Firestore backup metrics for the last 15 minutes.
         https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#firestore_backup_metrics
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

 # Swarming

   # swarming_bots_last_seen, divided by 1000 three times and then by 60, is
   # above 15. Bots matching ct-gce-.*, build4.+device.+ and skia-rpi-template
   # are excluded.
   # NOTE(review): the divisors suggest a nanosecond value converted to
   # minutes - confirm against the metric's producer.
   - alert: BotMissing
     expr: swarming_bots_last_seen{bot!~"(ct-gce-.*)|(build4.+device.+)|skia-rpi-template"}/1000/1000/1000/60 > 15
     labels:
       category: infra
       severity: critical
       abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$
     annotations:
       abbr: '{{ $labels.bot }}'
       description: >-
         Swarming bot {{ $labels.bot }} is missing.
         https://{{ $labels.swarming }}/bot?id={{ $labels.bot }}
         https://goto.google.com/skolo-maintenance

   # swarming_bots_last_task, divided by 1000 three times and then by 60
   # twice, is at least 72 for bots in pools matching Skia.* (excluding
   # skia-rpi-template).
   - alert: BotUnemployed
     expr: swarming_bots_last_task{pool=~"Skia.*",bot!="skia-rpi-template"}/1000/1000/1000/60/60 >= 72
     labels:
       category: infra
       severity: critical
       abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$
     annotations:
       abbr: '{{ $labels.bot }}'
       description: >-
         Swarming bot {{ $labels.bot }} has not run a job in 72 hours.
         Maybe its dimensions need changing?
         https://{{ $labels.swarming }}/bot?id={{ $labels.bot }}
         https://goto.google.com/skolo-maintenance

   # 10m average of swarming_bots_quarantined is at least 1, for device states
   # other than too_hot / low_battery (those are covered by DeviceUnhealthy).
   - alert: BotQuarantined
     expr: avg_over_time(swarming_bots_quarantined{device_state!~"(too_hot)|(low_battery)",bot!="skia-rpi-template"}[10m]) >= 1
     labels:
       category: infra
       severity: critical
       abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$
     annotations:
       abbr: '{{ $labels.bot }}'
       description: >-
         Swarming bot {{ $labels.bot }} is quarantined.
         https://{{ $labels.swarming }}/bot?id={{ $labels.bot }}
         https://goto.google.com/skolo-maintenance

   # 1h average of swarming_bots_quarantined, averaged per
   # (swarming, bot, device_state), is at least 1 for devices that are
   # too_hot or low_battery.
   - alert: DeviceUnhealthy
     expr: avg(avg_over_time(swarming_bots_quarantined{device_state=~"(too_hot)|(low_battery)",bot!="skia-rpi-template"}[1h])) by (swarming, bot, device_state) >= 1
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.bot }}'
       description: >-
         Swarming bot {{ $labels.bot }} is quarantined because the device is {{ $labels.device_state }}
         and has not resolved itself in 1+ hours.
         https://{{ $labels.swarming }}/bot?id={{ $labels.bot }}
         https://goto.google.com/skolo-maintenance

   # swarming_bots_uptime_s / 60 / 60 is above 36 (skia-rpi-template excluded).
   - alert: BotUptime
     expr: swarming_bots_uptime_s{bot!="skia-rpi-template"} / 60 / 60 > 36
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.bot }}'
       description: >-
         Swarming bot {{ $labels.bot }} has gone too long without a reboot.
         Check the events on the Swarming bot page and reboot manually if necessary.
         https://{{ $labels.swarming }}/bot?id={{ $labels.bot }}
         https://goto.google.com/skolo-maintenance

 # Alerts for supported branches.

   # cq_cfg_branch_exists is 0.
   - alert: MissingCQConfigForSupportedBranch
     expr: cq_cfg_branch_exists == 0
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.branch }}'
       description: >-
         There is no commit queue config entry for supported branch {{ $labels.branch }}
         in {{ $labels.repo }}.
         Either an entry needs to be added to {{ $labels.repo }}/+/infra/config/commit-queue.cfg
         or the branch needs to be marked as not supported in
         {{ $labels.repo }}/+/infra/config/supported-branches.json

   # cq_cfg_tryjob_exists is 0.
   - alert: NoSuchCQTryjobForSupportedBranch
     expr: cq_cfg_tryjob_exists == 0
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.branch }}'
       description: >-
         The commit queue config for supported branch {{ $labels.branch }} in {{ $labels.repo }}
         references unknown job {{ $labels.tryjob }}.
         Either the job needs to be removed or renamed in
         {{ $labels.repo }}/+/infra/config/commit-queue.cfg
         or the job needs to be added to
         {{ $labels.repo }}/+/{{ $labels.branch}}/infra/bots/tasks.json

   # cq_cfg_bot_exists_for_tryjob is 0.
   # The double space after "{{ $labels.repo }}." is part of the original
   # message and is kept inside a single line so folding cannot alter it.
   - alert: NoBotsExistForTryjobOnSupportedBranch
     expr: cq_cfg_bot_exists_for_tryjob == 0
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.branch }}'
       description: >-
         There are no bots which can run the tasks for {{ $labels.tryjob }}
         on supported branch {{ $labels.branch }}
         in {{ $labels.repo }}.  Either the dimensions for the tasks used by the job need to be updated in
         {{ $labels.repo }}/+/{{ $labels.branch}}/infra/bots/tasks.json
         or bots need to be added which can run the tasks.
         See the logs for details:
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}

 # K8s Checker
   # liveness_k8s_checker_s is above 300. No abbr annotation is set.
   - alert: K8sCheckerLiveness
     expr: liveness_k8s_checker_s > 300
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       description: >-
         k8s_checker has failed to run in the last 5 minutes.

   # dirty_committed_image_metric has been 1 for 2 hours.
   - alert: DirtyCommittedK8sImage
     expr: dirty_committed_image_metric == 1
     for: 2h
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.yaml }}'
       description: >-
         There is a dirty committed image {{ $labels.committedImage }}
         in {{ $labels.repo }}/+/refs/heads/master/{{ $labels.yaml }}

   # dirty_config_metric has been 1 for 2 hours.
   - alert: DirtyRunningK8sConfig
     expr: dirty_config_metric == 1
     for: 2h
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.container }}'
       description: >-
         For app {{ $labels.exported_app }} and container {{ $labels.container }}
         the running image differs from the image in
         {{ $labels.repo }}/+/refs/heads/master/{{ $labels.yaml }} : {{ $labels.liveImage }} != {{ $labels.committedImage }}

   # stale_image_metric has been above 30 for 15 minutes.
   - alert: StaleK8sImage
     expr: stale_image_metric > 30
     for: 15m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.container }}'
       description: >-
         For app {{ $labels.exported_app }} and container {{ $labels.container }}
         the running/committed image {{ $labels.liveImage }} is > 30 days old.
         Should push to it soon to pick up new changes and ensure things continue to work

   # app_running_metric has been 0 for 15 minutes.
   - alert: CheckedInK8sAppNotRunning
     expr: app_running_metric == 0
     for: 15m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_app }}'
       description: >-
         The app {{ $labels.exported_app }} is checked into
         {{ $labels.repo }}/+/refs/heads/master/{{ $labels.yaml }}
         but is not running in {{ $labels.project }}

   # container_running_metric has been 0 for 15 minutes.
   - alert: CheckedInK8sContainerNotRunning
     expr: container_running_metric == 0
     for: 15m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.container }}'
       description: >-
         The container {{ $labels.container }} of app {{ $labels.exported_app }} is checked into
         {{ $labels.repo }}/+/refs/heads/master/{{ $labels.yaml }}
         but is not running in {{ $labels.project }}

   # running_app_has_config_metric has been 0 for 15 minutes.
   - alert: RunningK8sAppNotCheckedIn
     expr: running_app_has_config_metric == 0
     for: 15m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_app }}'
       description: >-
         The app {{ $labels.exported_app }} is running in {{ $labels.project }}
         but is not checked into {{ $labels.repo }}

   # running_container_has_config_metric has been 0 for 15 minutes.
   # The description names the container (the abbr label) so this alert can be
   # told apart from RunningK8sAppNotCheckedIn; it previously repeated that
   # alert's app-only message verbatim.
   - alert: RunningK8sContainerNotCheckedIn
     expr: running_container_has_config_metric == 0
     for: 15m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.container }}'
       description: >-
         The container {{ $labels.container }} of app {{ $labels.exported_app }}
         is running in {{ $labels.project }}
         but is not checked into {{ $labels.repo }}

 # GitSync.
   # liveness_last_successful_git_sync_s exceeds 5*60.
   - alert: GitSyncStalled
     expr: liveness_last_successful_git_sync_s > 5*60
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.repo }}'
       description: >-
         gitsync has failed to update {{ $labels.repo }} for the last 5 minutes.
         Check out pod {{ $labels.statefulset_kubernetes_io_pod_name }} in project {{ $labels.project }}.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/gitsync/PROD.md#GitSyncStalled

   # Critical when the 5-minute rate of num_log_lines with level="ERROR" for
   # apps matching gitsync.* rises above 0.005. No `for:` hold period is set.
   - alert: GitSyncErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"gitsync.*"}[5m]) > 0.005
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded block scalar: every content line is indented deeper than the
       # `description` key, which strict YAML parsers require for multi-line
       # values. The lines are joined with single spaces, giving one message
       # followed by the log-viewer link and the playbook link.
       description: >-
         The error rate for {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
         https://skia.googlesource.com/buildbot/+/refs/heads/master/gitsync/PROD.md#GitSyncErrorRate

 # Gold Alerts
   # Warns when liveness_gold_expired_ignore_rules_monitoring_s exceeds 20*60
   # (presumably seconds, given the `_s` suffix). No `for:` hold period is set.
   - alert: GoldIgnoreMonitoring
     expr: liveness_gold_expired_ignore_rules_monitoring_s > 20*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         {{ $labels.app }} has not checked for expired ignore rules in a while.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldignoremonitoring

   # Warns when the metric="since-last-run" series of liveness_gold_bt_s
   # exceeds 150*60 (presumably seconds, given the `_s` suffix).
   - alert: GoldPollingIngestionStalled
     expr: liveness_gold_bt_s{metric="since-last-run"} > 150*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         {{ $labels.app }} has not been able to poll the bucket
         for missed files to ingest for a while.
         See
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldpollingingestionstalled

   # Warns when the metric="last-successful-process" series of
   # liveness_gold_bt_s exceeds 24*60*60 (presumably seconds, given the `_s`
   # suffix).
   - alert: GoldStreamingIngestionStalled
     expr: liveness_gold_bt_s{metric="last-successful-process"} > 24*60*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         {{ $labels.app }} has not successfully ingested files
         via streaming in a long time.
         See
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldstreamingingestionstalled

   # Critical when the type="wall_time" series of gold_last_commit_age_s
   # exceeds 24*60*60, the "24 hours" quoted in the description.
   - alert: GoldCommitTooOldWallTime
     expr: gold_last_commit_age_s{type="wall_time"} > 24*60*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         It has been at least 24 hours since the last commit made it into Gold
         for {{ $labels.app }}.
         Some process might have hung, or perhaps that repo simply has not seen
         a commit in that period.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcommittoooldwalltime

   # Critical when the type="with_new_commit" series of gold_last_commit_age_s
   # exceeds 60*60, the "1 hour" quoted in the description.
   - alert: GoldCommitTooOldNewerCommit
     expr: gold_last_commit_age_s{type="with_new_commit"} > 60*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         It has been at least 1 hour since a new commit landed in
         {{ $labels.app }}, and Gold still has not picked it up.
         Some process might have hung.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcommittoooldnewercommit

   # Warns when liveness_gold_status_monitoring_s exceeds 20*60, the
   # "20 minutes" quoted in the description.
   - alert: GoldStatusStalled
     expr: liveness_gold_status_monitoring_s > 20*60
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: warning
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         It has been at least 20 minutes since the Gold status was re-computed
         for {{ $labels.app }}.
         Some process might have hung.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldstatusstalled

   # Critical as soon as stopped_expstore_shards is above zero. No `for:` hold
   # period is set.
   - alert: GoldExpectationsStale
     expr: stopped_expstore_shards > 0
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         Shards of QuerySnapshotIterators have stopped.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldexpectationsstale

   # Critical as soon as bad_param_maps is above zero. No `for:` hold period
   # is set.
   - alert: GoldCorruptTryJobParamMaps
     expr: bad_param_maps > 0
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         TryJobResults may be missing Params.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcorrupttryjobdata

   # Critical as soon as bad_results is above zero. No `for:` hold period is
   # set. Links to the same playbook anchor as GoldCorruptTryJobParamMaps.
   - alert: GoldTryJobResultsIncompleteData
     expr: bad_results > 0
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         There are a nonzero amount of TryJobResults that were missing Params.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcorrupttryjobdata

   # Critical when gold_empty_commits_at_head reaches 20 or more. Series whose
   # appgroup matches gold-chrome.* are excluded by the selector.
   - alert: GoldNoDataAtHead
     expr: gold_empty_commits_at_head{appgroup!~"gold-chrome.*"} >= 20
     labels:
       category: infra
       owner: kjlubick@google.com
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         The most recent 20 commits for {{ $labels.app }} have no data.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldnodataathead

   # Critical when gold_num_recent_open_cls reaches 50 or more. Series whose
   # appgroup matches gold-chrome.* are excluded here and covered by
   # GoldTooManyCLsChrome. The comparison is `>=`, so the message says
   # "at least 50" to match the value at which the alert actually fires.
   - alert: GoldTooManyCLs
     expr: gold_num_recent_open_cls{appgroup!~"gold-chrome.*"} >= 50
     labels:
       category: infra
       severity: critical
       owner: kjlubick@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         There are at least 50 recent, open CLs with tryjob data
         for {{ $labels.app }}.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldtoomanycls

   # Critical when gold_num_recent_open_cls reaches 200 or more, restricted to
   # series whose appgroup matches gold-chrome.*. The comparison is `>=`, so
   # the message says "at least 200" to match the value at which the alert
   # actually fires.
   - alert: GoldTooManyCLsChrome
     expr: gold_num_recent_open_cls{appgroup=~"gold-chrome.*"} >= 200
     labels:
       category: infra
       severity: critical
       owner: kjlubick@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         There are at least 200 recent, open CLs with tryjob data
         for {{ $labels.app }}.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldtoomanycls

   # Critical when liveness_gold_comment_monitoring_s exceeds 20*60, the
   # "20 minutes" quoted in the description. No `for:` hold period is set.
   - alert: GoldCommentingStalled
     expr: liveness_gold_comment_monitoring_s > 20*60
     labels:
       category: infra
       severity: critical
       owner: kjlubick@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         It has been at least 20 minutes since Gold went through all open CLs
         for {{ $labels.app }} to maybe comment on them.
         Some process might have hung.
         https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcommentingstalled


 # Velero Backup
   # Scales the 24h per-second rate of velero_backup_failure_total by the
   # number of seconds in a day and alerts when the result is above 1.
   # NOTE(review): the expression counts failures, while the description talks
   # about time since a successful backup - confirm the wording is intended.
   - alert: VeleroBackupFailed
     expr: rate(velero_backup_failure_total[24h])*(24*60*60) > 1
     labels:
       category: infra
       owner: jcgregorio@google.com
       # NOTE(review): the neighbouring rules only use `warning` or `critical`;
       # confirm `error` is a severity the alert routing recognises.
       severity: error
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         It has been more than 24 hours since a successful backup
         in {{ $labels.project }}.
         See
         https://skia.googlesource.com/buildbot/%2B/master/velero/PROD.md#backup_failed

 # Git is supposed to be obtained through CIPD.
   # Critical as soon as git_from_cipd equals 0. No `for:` hold period is set
   # and no owner label is attached.
   - alert: GitNotFromCIPD
     expr: git_from_cipd == 0
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded scalar: the lines below are joined with single spaces.
       description: >-
         {{ $labels.app }} is using a version of Git
         which is not obtained via CIPD.