promk/prometheus/alerts_public.yml - buildbot - Git at Google

 # Alerts for things in the skia-public cluster only.
 #
 # If anything in this file starts to run in another cluster, such as
 # skia-corp, then break it out into its own alerts_NNNN.yml file
 # and include it in each prometheus-CLUSTER.yml file that is it running in.
 groups:
 - name: general
   rules:

   # This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
   # for processes with this metric.
   - alert: CrashLoop
     expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.instance }}'
       description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
         startup. Logs:

           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

           https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22k8s_container%22%0Aresource.labels.pod_name%3D%22{{ $labels.kubernetes_pod_name }}%22"
         '

   # Docker Pushes Watcher
   - alert: DockerPushesWatcherLiveness
     expr: liveness_docker_watcher_pubsub_receive_s > 60 * 60 * 24 * 2
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       description: 'Docker pushes watcher has failed to recieve a pubsub event in the last 48 hours.

           https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
         '

   - alert: DockerPushesWatcherTagFailures
     expr: docker_watcher_tag_failure > 0
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       abbr: '{{ $labels.image }}'
       description: 'Docker pushes watcher has failed to successfully add "prod" tag to the docker image of {{ $labels.image }} in the repo {{ $labels.repo }}

           https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
         '

   - alert: DockerPushesWatcherPushFailures
     expr: docker_watcher_push_failure > 0
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       abbr: '{{ $labels.image }}'
       description: 'Docker pushes watcher has failed to successfully pushk the docker image of {{ $labels.image }} in the repo {{ $labels.repo }}

           https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
         '

   # Fiddle
   - alert: InsufficientFiddlerPods
     expr: avg_over_time(pods_idle[2m]) < 5
     for: 15m
     labels:
       category: infra
       severity: warning
       owner: jcgregorio@google.com
     annotations:
       description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#fiddler_pods'

   - alert: NamedFiddlesFailing
     expr: named_fiddles_errors_in_examples_run > 0
     for: 15m
     labels:
       category: infra
       severity: warning
       owner: jcgregorio@google.com
     annotations:
       description: 'Some named fiddles are failing to compile or run. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#NamedFiddlesFailing'

   - alert: InvalidNamedFiddles
     expr: named_fiddles_examples_total_invalid > 0
     for: 15m
     labels:
       category: infra
       severity: warning
       owner: jcgregorio@google.com
     annotations:
       description: 'Some named fiddles are failing to be parsed. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#InvalidNamedFiddles'

   - alert: FiddlerPodCommunicationErrors
     expr: rate(run_exhaustion[20m]) * 20 * 60 > 5
     for: 5m
     labels:
       category: infra
       severity: warning
       owner: jcgregorio@google.com
     annotations:
       description: 'Fiddle is having trouble communicating with fiddler pods.'

   # CQ Watcher
   - alert: TooManyCLsInCQ
     expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       description: 'There are 10 CLs or more waiting in the CQ. Dry run queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D1+status:open and Commit queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D2+status:open Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#too_many_cls'

   - alert: CQTrybotRunningTooLong
     expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.trybot }}'
       description: '{{ $labels.trybot }} ran longer than 45 mins on {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#trybot_duration_beyond_threshold'

   - alert: TooManyCQTrybotsForCL
     expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 100
     labels:
       category: infra
       severity: warning
     annotations:
       description: 'There are more than 100 CQ trybots triggered by {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#too_many_trybots_triggered'

  # datatore backups
   - alert: BackupNotDone
     expr: liveness_backup_success_s/60/60/24 > 7
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: skia-public
       description: 'A backup of Cloud Datastore has not succeeded in the last week. https://skia.googlesource.com/buildbot/%2B/main/ds/PROD.md#backup_not_done'

 # alert-to-pubsub liveness
   - alert: AlertToPubSubLiveness
     expr: (min(liveness_alive_s{location=~".+"}) by (location)) > 90
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       abbr: '{{ $labels.location }}'
       description: 'alert-to-pubsub for {{ $labels.location }} has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/main/am/PROD.md#alert_to_pubsub'

 # CT
 # TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public.
   - alert: CTFEPendingTaskCount
     expr: num_pending_tasks{app="ctfe"} >= 10
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/main/ct/PROD.md#ctfe_pending_tasks'

   - alert: CTFEPendingTaskNotRunning
     expr: oldest_pending_task_status{app="ctfe"} >= 2
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/main/ct/PROD.md#ctfe_pending_tasks'

   - alert: AutoRollLatency
     expr: prober{type="latency",probename="autoroll"} > 200
     for: 10m
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#http_latency'

   - alert: FlutterLicenseScriptFailure
     expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       description: 'The License scripts in the Skia->Flutter roller have failed.
       https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
       https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#flutter_license_script_failure'

 # skia-flutter-autoroll takes a long time to transition because its pre-upload
 # scripts run flutter's license script which can take around 40 minutes.
   - alert: AutoRollLastTransition
     expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       abbr: 'skia-flutter-autoroll'
       description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes.
       https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
       '

 # Perf
   - alert: AndroidIngestStalled
     expr: rate(ack[30m]) < 0.01
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'The Android Perf instances that run using --event_driven_regression_detection have stopped receiving data. Confirm via https://android-metric-ingest.skia.org/ that data is arriving.'

   - alert: AndroidRegressionDetectionTooSlow
     expr: sum(rate(perf_regression_store_found{app=~"perf-clustering-android|skiaperf|skiaperf-android-x"}[1h])) by (app) < 0.01
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Perf failed to find any regressions in an hour. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#regression_detection_slow'

   - alert: AndroidIngestFailures
     expr: rate(process_failures[1h]) > 0.01
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#process_failures'

   - alert: AndroidIngestLiveness
     expr: liveness_last_successful_add_s > 300
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Liveness for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#liveness'


   - alert: AndroidIngestBadFilesTooHigh
     expr: bad_files/uploads > 0.5
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Error rate for bad_files is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#bad_files'

   - alert: AndroidTxLogFailures
     expr: tx_log_write_failure > 0
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#tx_log'

   - alert: PerfIngestionSuccessRateTooLow
     expr: rate(perfserver_ingest_successful_write[1h]) / rate(perfserver_ingest_files_received[1h])  < 0.95
     for: 10m
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Too few ingestion files are being successfully ingested. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#success_rate_too_low'

   - alert: PerfAndroidClusteringSlow
     expr: sum(rate(perf_clustering_runs{app="perf-clustering-android"}[1h])*60*60) < 1
     for: 15m
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Android Perf Clustering (https://android-main-perf.skia.org)Rate is too low. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#android_clustering_rate'

   - alert: PerfClusteringSlow
     expr: rate(perf_clustering_runs{app!="skiaperf-android"}[1h])*60*60 < 1
     for: 60m
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       description: 'Perf Clustering Rate is too low. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#clustering_rate'

 # Prober
   - alert: ProbeFailure
     expr: prober{type="failure"} > 0
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.probename }} {{ $labels.url }}'
       description: 'Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to
         respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{
         $labels.probename }}+filename%3Aprobersk.json5 for the endpoint URL.'

   - alert: ProberLiveness
     expr: liveness_probes_s > 300
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       description: 'The prober has failed to probe in the last 5 minutes.'


 # Grafana Backup
   - alert: GrafanaBackupLiveness
     expr: liveness_backup_s > 60*60*25
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       description: 'backup-to-gcs has failed to back up the Grafana db in the last 24 hours. Check the logs.'

 # Skia Status
   - alert: StatusLatency
     expr: avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1024  > 10
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       description: 'The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond.'

 # Datahopper

   - alert: FirestoreBackupTooOld
     expr: liveness_last_successful_firestore_backup_s{app="datahopper"}/60/60/24 > 7
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       description: 'The most recent successful Firestore weekly backup was more than 7 days ago. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#firestore_weekly_backup'

   - alert: GoDEPSTooOld
     expr: liveness_last_file_modification_s{file="go.mod"}/60/60/24 > 3
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.file }} in {{ $labels.repo }} too old'
       description: 'The {{ $labels.file }} file in {{ $labels.repo }} was last updated more than 3 days ago. https://skia.googlesource.com/buildbot/%2B/main/infra/bots/task_drivers/update_go_deps/update_go_deps.go'

 # External RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-(e-)?rpi-.+",type="free"} < 1e8
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # External bots except RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df="root",exported_instance=~"skia-e-[^r].+",type="free"} < 1e9
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # Dev RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-d-rpi-.+",type="free"} < 1e8
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # Dev bots except RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df="root",exported_instance=~"skia-d-[^r].+",type="free"} < 1e9
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # Internal RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-i-rpi-.+",type="free"} < 1e8
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # Internal bots except RPis.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df="root",exported_instance=~"skia-i-[^r].+",type="free"} < 1e9
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }}  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # GCE machines (other than bots), root disk.
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df="root",exported_instance!~"skia-(e|i|d|rpi)-.+",type="free"} < 1e9
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Low Root Disk Space on {{ $labels.exported_instance }}.  https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

 # GCE bots, /b (aka /mnt/pd0)
   - alert: DiskSpaceLow
     expr: collectd_df_df_complex{df="mnt-pd0", exported_instance=~"skia-e-gce.+",type="free"} < 1e10
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       abbr: '{{ $labels.exported_instance }}'
       description: 'Low Disk Space on /b for {{ $labels.exported_instance }}. https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'


 # Envoy
   - alert: EnvoyClusterBindError
     expr: envoy_cluster_bind_errors > 0
     for: 5m
     labels:
       category: infra
       severity: error
     annotations:
       abbr: '{{ $labels.envoy_cluster_name }}'
       description: 'Envoy Cluster Bind Error for {{ $labels.envoy_cluster_name
       }}. https://skia.googlesource.com/buildbot/+doc/main/skfe/PROD.md#cluster_bind_error'

   - alert: EnvoyRuntimeLoadError
     expr: envoy_runtime_load_error > 0
     for: 5m
     labels:
       category: infra
       severity: error
     annotations:
       abbr: '{{ $labels.app }}'
       description: 'Envoy Runtime Load Error for {{ $labels.app }}. https://skia.googlesource.com/buildbot/%2B/main/skfe/PROD.md#runtime_load_error'

   - alert: EnvoyClusterNotOK
     expr: envoy_cluster_lb_local_cluster_not_ok > 0
     for: 5m
     labels:
       category: infra
       severity: error
     annotations:
       abbr: '{{ $labels.envoy_cluster_name }}'
       description: 'Envoy Cluster Not OK for {{ $labels.envoy_cluster_name }}. https://skia.googlesource.com/buildbot/%2B/main/skfe/PROD.md#envoy_cluster_lb_local_cluster_not_ok'

 # Push
   - alert: DirtyPackages
     expr: min_over_time(dirty_packages[25h]) >= 1
     labels:
       category: infra
       severity: warning
     annotations:
       description: 'One or more dirty packages have been running for more than 24 hours. https://push.skia.org'

 # Docsyserver
   - alert: SkiaDotOrgRefreshFail
     expr: liveness_docsy_docset_refresh_s > 600
     labels:
       category: infra
       severity: critical
       owner: jcgregorio@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       description: 'docsyserver has failed to successfully refresh in the last 10 minutes. Check the logs for docsyserver and look for errors.'
	# Alerts for things in the skia-public cluster only.
	#
	# If anything in this file starts to run in another cluster, such as
	# skia-corp, then break it out into its own alerts_NNNN.yml file
	# and include it in each prometheus-CLUSTER.yml file that is it running in.
	groups:
	- name: general
	rules:

	# This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
	# for processes with this metric.
	- alert: CrashLoop
	expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.instance }}'
	description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
	startup. Logs:

	kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

	https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22k8s_container%22%0Aresource.labels.pod_name%3D%22{{ $labels.kubernetes_pod_name }}%22"
	'

	# Docker Pushes Watcher
	- alert: DockerPushesWatcherLiveness
	expr: liveness_docker_watcher_pubsub_receive_s > 60 * 60 * 24 * 2
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'Docker pushes watcher has failed to recieve a pubsub event in the last 48 hours.

	https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
	'

	- alert: DockerPushesWatcherTagFailures
	expr: docker_watcher_tag_failure > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	abbr: '{{ $labels.image }}'
	description: 'Docker pushes watcher has failed to successfully add "prod" tag to the docker image of {{ $labels.image }} in the repo {{ $labels.repo }}

	https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
	'

	- alert: DockerPushesWatcherPushFailures
	expr: docker_watcher_push_failure > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	abbr: '{{ $labels.image }}'
	description: 'Docker pushes watcher has failed to successfully pushk the docker image of {{ $labels.image }} in the repo {{ $labels.repo }}

	https://console.cloud.google.com/logs/viewer?project=skia-public&resource=k8s_container%2Fcluster_name%2Fskia-public%2Fnamespace_name%2Fdefault%2Fcontainer_name%2Fdocker-pushes-watcher
	'

	# Fiddle
	- alert: InsufficientFiddlerPods
	expr: avg_over_time(pods_idle[2m]) < 5
	for: 15m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#fiddler_pods'

	- alert: NamedFiddlesFailing
	expr: named_fiddles_errors_in_examples_run > 0
	for: 15m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Some named fiddles are failing to compile or run. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#NamedFiddlesFailing'

	- alert: InvalidNamedFiddles
	expr: named_fiddles_examples_total_invalid > 0
	for: 15m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Some named fiddles are failing to be parsed. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/fiddlek/PROD.md#InvalidNamedFiddles'

	- alert: FiddlerPodCommunicationErrors
	expr: rate(run_exhaustion[20m]) * 20 * 60 > 5
	for: 5m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Fiddle is having trouble communicating with fiddler pods.'

	# CQ Watcher
	- alert: TooManyCLsInCQ
	expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	description: 'There are 10 CLs or more waiting in the CQ. Dry run queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D1+status:open and Commit queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D2+status:open Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#too_many_cls'

	- alert: CQTrybotRunningTooLong
	expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.trybot }}'
	description: '{{ $labels.trybot }} ran longer than 45 mins on {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#trybot_duration_beyond_threshold'

	- alert: TooManyCQTrybotsForCL
	expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 100
	labels:
	category: infra
	severity: warning
	annotations:
	description: 'There are more than 100 CQ trybots triggered by {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/main/cq_watcher/PROD.md#too_many_trybots_triggered'

	# datatore backups
	- alert: BackupNotDone
	expr: liveness_backup_success_s/60/60/24 > 7
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: skia-public
	description: 'A backup of Cloud Datastore has not succeeded in the last week. https://skia.googlesource.com/buildbot/%2B/main/ds/PROD.md#backup_not_done'

	# alert-to-pubsub liveness
	- alert: AlertToPubSubLiveness
	expr: (min(liveness_alive_s{location=~".+"}) by (location)) > 90
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	abbr: '{{ $labels.location }}'
	description: 'alert-to-pubsub for {{ $labels.location }} has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/main/am/PROD.md#alert_to_pubsub'

	# CT
	# TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public.
	- alert: CTFEPendingTaskCount
	expr: num_pending_tasks{app="ctfe"} >= 10
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/main/ct/PROD.md#ctfe_pending_tasks'

	- alert: CTFEPendingTaskNotRunning
	expr: oldest_pending_task_status{app="ctfe"} >= 2
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/main/ct/PROD.md#ctfe_pending_tasks'

	- alert: AutoRollLatency
	expr: prober{type="latency",probename="autoroll"} > 200
	for: 10m
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#http_latency'

	- alert: FlutterLicenseScriptFailure
	expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'The License scripts in the Skia->Flutter roller have failed.
	https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
	https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#flutter_license_script_failure'

	# skia-flutter-autoroll takes a long time to transition because its pre-upload
	# scripts run flutter's license script which can take around 40 minutes.
	- alert: AutoRollLastTransition
	expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	abbr: 'skia-flutter-autoroll'
	description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes.
	https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
	'

	# Perf
	- alert: AndroidIngestStalled
	expr: rate(ack[30m]) < 0.01
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'The Android Perf instances that run using --event_driven_regression_detection have stopped receiving data. Confirm via https://android-metric-ingest.skia.org/ that data is arriving.'

	- alert: AndroidRegressionDetectionTooSlow
	expr: sum(rate(perf_regression_store_found{app=~"perf-clustering-android\|skiaperf\|skiaperf-android-x"}[1h])) by (app) < 0.01
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Perf failed to find any regressions in an hour. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#regression_detection_slow'

	- alert: AndroidIngestFailures
	expr: rate(process_failures[1h]) > 0.01
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#process_failures'

	- alert: AndroidIngestLiveness
	expr: liveness_last_successful_add_s > 300
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Liveness for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#liveness'


	- alert: AndroidIngestBadFilesTooHigh
	expr: bad_files/uploads > 0.5
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Error rate for bad_files is too high. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#bad_files'

	- alert: AndroidTxLogFailures
	expr: tx_log_write_failure > 0
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/main/android_ingest/PROD.md#tx_log'

	- alert: PerfIngestionSuccessRateTooLow
	expr: rate(perfserver_ingest_successful_write[1h]) / rate(perfserver_ingest_files_received[1h]) < 0.95
	for: 10m
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Too few ingestion files are being successfully ingested. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#success_rate_too_low'

	- alert: PerfAndroidClusteringSlow
	expr: sum(rate(perf_clustering_runs{app="perf-clustering-android"}[1h])6060) < 1
	for: 15m
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Android Perf Clustering (https://android-main-perf.skia.org)Rate is too low. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#android_clustering_rate'

	- alert: PerfClusteringSlow
	expr: rate(perf_clustering_runs{app!="skiaperf-android"}[1h])6060 < 1
	for: 60m
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Perf Clustering Rate is too low. See https://github.com/google/skia-buildbot/blob/main/perf/PROD.md#clustering_rate'

	# Prober
	- alert: ProbeFailure
	expr: prober{type="failure"} > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.probename }} {{ $labels.url }}'
	description: 'Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to
	respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{
	$labels.probename }}+filename%3Aprobersk.json5 for the endpoint URL.'

	- alert: ProberLiveness
	expr: liveness_probes_s > 300
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	description: 'The prober has failed to probe in the last 5 minutes.'


	# Grafana Backup
	- alert: GrafanaBackupLiveness
	expr: liveness_backup_s > 606025
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	description: 'backup-to-gcs has failed to back up the Grafana db in the last 24 hours. Check the logs.'

	# Skia Status
	- alert: StatusLatency
	expr: avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1024 > 10
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	description: 'The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond.'

	# Datahopper

	- alert: FirestoreBackupTooOld
	expr: liveness_last_successful_firestore_backup_s{app="datahopper"}/60/60/24 > 7
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	description: 'The most recent successful Firestore weekly backup was more than 7 days ago. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#firestore_weekly_backup'

	- alert: GoDEPSTooOld
	expr: liveness_last_file_modification_s{file="go.mod"}/60/60/24 > 3
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	abbr: '{{ $labels.file }} in {{ $labels.repo }} too old'
	description: 'The {{ $labels.file }} file in {{ $labels.repo }} was last updated more than 3 days ago. https://skia.googlesource.com/buildbot/%2B/main/infra/bots/task_drivers/update_go_deps/update_go_deps.go'

	# External RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df=~"var\|tmp",exported_instance=~"skia-(e-)?rpi-.+",type="free"} < 1e8
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# External bots except RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df="root",exported_instance=~"skia-e-[^r].+",type="free"} < 1e9
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# Dev RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df=~"var\|tmp",exported_instance=~"skia-d-rpi-.+",type="free"} < 1e8
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# Dev bots except RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df="root",exported_instance=~"skia-d-[^r].+",type="free"} < 1e9
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# Internal RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df=~"var\|tmp",exported_instance=~"skia-i-rpi-.+",type="free"} < 1e8
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# Internal bots except RPis.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df="root",exported_instance=~"skia-i-[^r].+",type="free"} < 1e9
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# GCE machines (other than bots), root disk.
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df="root",exported_instance!~"skia-(e\|i\|d\|rpi)-.+",type="free"} < 1e9
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'

	# GCE bots, /b (aka /mnt/pd0)
	- alert: DiskSpaceLow
	expr: collectd_df_df_complex{df="mnt-pd0", exported_instance=~"skia-e-gce.+",type="free"} < 1e10
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.exported_instance }}'
	description: 'Low Disk Space on /b for {{ $labels.exported_instance }}. https://skia.googlesource.com/buildbot/+doc/main/docs/PROD.md#diskspacelow'


	# Envoy
	- alert: EnvoyClusterBindError
	expr: envoy_cluster_bind_errors > 0
	for: 5m
	labels:
	category: infra
	severity: error
	annotations:
	abbr: '{{ $labels.envoy_cluster_name }}'
	description: 'Envoy Cluster Bind Error for {{ $labels.envoy_cluster_name
	}}. https://skia.googlesource.com/buildbot/+doc/main/skfe/PROD.md#cluster_bind_error'

	- alert: EnvoyRuntimeLoadError
	expr: envoy_runtime_load_error > 0
	for: 5m
	labels:
	category: infra
	severity: error
	annotations:
	abbr: '{{ $labels.app }}'
	description: 'Envoy Runtime Load Error for {{ $labels.app }}. https://skia.googlesource.com/buildbot/%2B/main/skfe/PROD.md#runtime_load_error'

	- alert: EnvoyClusterNotOK
	expr: envoy_cluster_lb_local_cluster_not_ok > 0
	for: 5m
	labels:
	category: infra
	severity: error
	annotations:
	abbr: '{{ $labels.envoy_cluster_name }}'
	description: 'Envoy Cluster Not OK for {{ $labels.envoy_cluster_name }}. https://skia.googlesource.com/buildbot/%2B/main/skfe/PROD.md#envoy_cluster_lb_local_cluster_not_ok'

	# Push
	- alert: DirtyPackages
	expr: min_over_time(dirty_packages[25h]) >= 1
	labels:
	category: infra
	severity: warning
	annotations:
	description: 'One or more dirty packages have been running for more than 24 hours. https://push.skia.org'

	# Docsyserver
	- alert: SkiaDotOrgRefreshFail
	expr: liveness_docsy_docset_refresh_s > 600
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	abbr: '{{ $labels.app }}'
	description: 'docsyserver has failed to successfully refresh in the last 10 minutes. Check the logs for docsyserver and look for errors.'