promk/prometheus/alerts_public.yml - buildbot - Git at Google

 # Alerts for things in the skia-public cluster only.
 #
 # If anything in this file starts to run in another cluster, such as
 # skia-corp, then break it out into its own alerts_NNNN.yml file
 # and include it in each prometheus-CLUSTER.yml file that it is running in.
 groups:
 - name: general
   rules:

   # This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
   # for processes with this metric.
   # Fires when the 6m maximum of liveness_uptime_s (selected with the matcher
   # kubernetes_pod_name="") has stayed below 60 * 3 = 180 for 5m.
   - alert: CrashLoop
     expr: 'max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3'
     for: 5m
     annotations:
       abbr: "{{ $labels.instance }}"
       # Kept as a quoted multi-line scalar: the blank lines fold to newlines
       # between the sentence, the kubectl command and the log URL.
       description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
         startup. Logs:

           kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

           https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
         '
     labels:
       severity: critical
       category: infra

   # Container Builder
   # Fires as soon as ci_build_failure is 2 or more (there is no `for` hold).
   - alert: ContainerBuilderFailure
     expr: 'ci_build_failure >= 2'
     annotations:
       abbr: "{{ $labels.trigger }}"
       description: >-
         The build with trigger name {{ $labels.trigger }} has failed when
         rebuilding twice in a row.
     labels:
       severity: warning
       category: infra

   # Continuous Deploy
   # Fires when liveness_ci_pubsub_receive_s has stayed above 60 * 60 * 24 * 2
   # (172800, the "48 hours" named in the description) for 5m.
   - alert: ContinuousDeployLiveness
     expr: liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       # The blank line inside the quoted scalar folds to a newline, so the log
       # URL starts on its own line in the rendered annotation.
       description: 'Continuous deploy has failed to receive a pubsub event in the last 48 hours.

           https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
         '

   # Fires when ci_push_failure has stayed strictly above 2 for 5m.
   # NOTE(review): the sibling ContainerBuilderFailure rule uses `>= 2`; if
   # "two or more" was the intent here, change the expr rather than the text.
   - alert: ContinuousDeployFailures
     expr: ci_push_failure > 2
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       # The blank line inside the quoted scalar folds to a newline, so the log
       # URL starts on its own line in the rendered annotation.
       description: 'Continuous deploy has failed to successfully push more than two times in a row.

           https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
         '

   # Fiddle
   # Fires when the 2m average of pods_idle has been below 5 for 15m.
   - alert: InsufficientFiddlerPods
     expr: 'avg_over_time(pods_idle[2m]) < 5'
     for: 15m
     annotations:
       description: >-
         Fiddle is experiencing heavy load and has insufficient idle fiddler pods.
         https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods
     labels:
       owner: jcgregorio@google.com
       severity: warning
       category: infra

   # Fires when named_fiddles_total_invalid has been above 0 for 15m.
   - alert: NamedFiddlesFailing
     expr: 'named_fiddles_total_invalid > 0'
     for: 15m
     annotations:
       description: >-
         Some named fiddles are failing.
         Visit https://named-fiddles.skia.org to see which ones.
     labels:
       owner: jcgregorio@google.com
       severity: warning
       category: infra

   # rate(...[20m]) is a per-second average; multiplying by 20 * 60 scales it to
   # a count per 20 minutes. Fires when that count has exceeded 5 for 5m.
   - alert: FiddlerPodCommunicationErrors
     expr: 'rate(run_exhaustion[20m]) * 20 * 60 > 5'
     for: 5m
     annotations:
       description: >-
         Fiddle is having trouble communicating with fiddler pods.
     labels:
       owner: jcgregorio@google.com
       severity: warning
       category: infra

   # CQ Watcher
   # Fires when cq_watcher_in_flight_waiting_in_cq (app="cq-watcher") has been
   # at 10 or more for 5m.
   - alert: TooManyCLsInCQ
     expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10
     for: 5m
     labels:
       category: infra
       severity: warning
     annotations:
       description: 'There are 10 or more CLs waiting in the Skia CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls'

   # Update CQ_TRYBOT_DURATION_SECS_THRESHOLD in go/cq/cq.go if the number below is changed.
   # Fires when the 20m maximum of cq_watcher_in_flight_trybot_duration exceeds
   # 2700 (the description reports this as 45 mins). No `for` hold period.
   - alert: CQTrybotRunningTooLong
     expr: 'max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700'
     annotations:
       abbr: "{{ $labels.trybot }}"
       description: >-
         {{ $labels.trybot }} ran longer than 45 mins. Playbook:
         https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold
         Direct link to logs:
         https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQTrybotDurationError
     labels:
       abbr_owner_regex: 'rmistry@google.com:Build-.*-Android_Framework'
       severity: warning
       category: infra

   # Update CQ_TRYBOTS_COUNT_THRESHOLD in go/cq/cq.go if the number below is changed.
   # Fires when the 20m maximum of cq_watcher_in_flight_trybot_num exceeds 50.
   # No `for` hold period.
   - alert: TooManyCQTrybotsForCL
     expr: 'max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 50'
     annotations:
       description: >-
         There are more than 50 CQ trybots triggered by at least one CL. Playbook:
         https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered
         Direct link to logs:
         https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQCLsCountError
     labels:
       severity: warning
       category: infra

   # Fuzzer
   # Fires when fuzzer_queue_size_upload has been above 90 for 2m.
   - alert: FuzzerUploadQueueFull
     expr: 'fuzzer_queue_size_upload > 90'
     for: 2m
     annotations:
       abbr: "{{ $labels.kubernetes_pod_name }}"
       description: >-
         Fuzzer upload queue has been very full on {{ $labels.kubernetes_pod_name }}.
         https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload
     labels:
       owner: kjlubick@google.com
       severity: critical
       category: infra

   # fuzzer_version_age{type="current"} is divided by 60*60*24 and compared with
   # 10; the description reports that quotient as days. No `for` hold period.
   - alert: FuzzerVersionStale
     expr: fuzzer_version_age{type="current"}/60/60/24 > 10
     labels:
       category: infra
       severity: warning
       owner: kjlubick@google.com
     annotations:
       abbr: '{{ $labels.kubernetes_pod_name }}'
       # Folded block scalar so the apostrophe in "hasn't" needs no escaping.
       description: >-
         The Fuzzer on {{ $labels.kubernetes_pod_name }} hasn't rolled its version forward in 10 days.  Roll it forward on fuzzer.skia.org
         https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version

   # fuzzer_version_age{type="pending"} is divided by 60*60 and compared with 2;
   # the description reports that quotient as hours. No `for` hold period.
   - alert: FuzzerSlowRoll
     expr: fuzzer_version_age{type="pending"}/60/60 > 2
     labels:
       category: infra
       severity: critical
       owner: kjlubick@google.com
     annotations:
       abbr: '{{ $labels.kubernetes_pod_name }}'
       # Folded block scalar so the apostrophe in "hasn't" needs no escaping.
       description: >-
         The fuzzer on {{ $labels.kubernetes_pod_name }} hasn't finished rolling its version forward in 2 hours.  Something might be wrong.  https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll

   # Fires when fuzzer_queue_size_analysis has been above 900000 for 2m.
   - alert: FuzzerAnalysisQueueFull
     expr: 'fuzzer_queue_size_analysis > 900000'
     for: 2m
     annotations:
       abbr: "{{ $labels.kubernetes_pod_name }}"
       description: >-
         Fuzzer analysis queue has been very full on {{ $labels.kubernetes_pod_name }}.
         https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis
     labels:
       owner: kjlubick@google.com
       severity: critical
       category: infra

 # datastore backups
   # liveness_backup_success_s is divided by 60*60 and compared with 25; the
   # description reports that quotient as hours. No `for` hold period.
   - alert: BackupNotDone
     expr: 'liveness_backup_success_s/60/60 > 25'
     annotations:
       abbr: "skia-public"
       description: >-
         A backup of Cloud Datastore has not succeeded in the last 25 hours.
         https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done
     labels:
       severity: critical
       category: infra

 # alert-to-pubsub liveness
   # Takes the minimum liveness_alive_s per `location` label and fires when
   # that minimum exceeds 90. No `for` hold period.
   - alert: AlertToPubSubLiveness
     expr: '(min(liveness_alive_s) by (location)) > 90'
     annotations:
       description: >-
         alert-to-pubsub has failed to send a healthz PubSub event in 90s.
         https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub
     labels:
       owner: jcgregorio@google.com
       severity: critical
       category: infra

 # CT
 # TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public.
   # Fires when healthy{app="ct-master"} has been anything other than 1 for 5m.
   - alert: CTPollerHealthCheckFailed
     expr: 'healthy{app="ct-master"} != 1'
     for: 5m
     annotations:
       description: >-
         CT poller health check is failing.
         https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check
     labels:
       owner: rmistry@google.com
       severity: critical
       category: infra

   # Fires when num_pending_tasks{app="ctfe"} has been at 10 or more for 5m.
   - alert: CTFEPendingTaskCount
     expr: 'num_pending_tasks{app="ctfe"} >= 10'
     for: 5m
     annotations:
       description: >-
         There are a lot of CTFE pending tasks.
         https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks
     labels:
       owner: rmistry@google.com
       severity: critical
       category: infra

   # Fires when oldest_pending_task_status{app="ctfe"} has been 2 or more for
   # 5m. The meaning of the individual status values is not defined in this file.
   - alert: CTFEPendingTaskNotRunning
     expr: 'oldest_pending_task_status{app="ctfe"} >= 2'
     for: 5m
     annotations:
       description: >-
         A task has been waiting to be executed for a while and it has still not started.
         https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks
     labels:
       owner: rmistry@google.com
       severity: critical
       category: infra

   # Fires when the prober series with type="latency" and probename="autoroll"
   # has been above 200 (milliseconds, per the description) for 10m.
   - alert: AutoRollLatency
     expr: 'prober{type="latency",probename="autoroll"} > 200'
     for: 10m
     annotations:
       description: >-
         The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond.
         https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency
     labels:
       owner: borenet@google.com
       severity: critical
       category: infra

   # Fires when the 1h per-second rate of num_log_lines with level="ERROR" from
   # apps matching autoroll-fe.* exceeds 0.001. No `for` hold period.
   - alert: AutoRollFrontendErrorRate
     expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001
     labels:
       category: infra
       severity: critical
       owner: borenet@google.com
     annotations:
       abbr: '{{ $labels.app }}'
       # Folded block scalar: the lines below are joined with single spaces.
       # Content is indented deeper than the key, as the YAML spec requires for
       # multi-line scalars.
       description: >-
         The error rate for autoroll on {{ $labels.app }} is too high.
         https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
         https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate


   # Fires when flutter_license_script_failure for the app
   # autoroll-be-skia-flutter-autoroll has been above 0 for 5m.
   - alert: FlutterLicenseScriptFailure
     expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0
     for: 5m
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       # Folded block scalar: the lines below are joined with single spaces.
       # Content is indented deeper than the key, as the YAML spec requires for
       # multi-line scalars.
       description: >-
         The License scripts in the Skia->Flutter roller have failed.
         https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
         https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#flutter_license_script_failure

 # skia-flutter-autoroll takes a long time to transition because its pre-upload
 # scripts run flutter's license script which can take around 40 minutes.
   # Fires when liveness_last_successful_autoroll_tick_s for the roller
   # skia-flutter-autoroll exceeds 50*60 = 3000 (50 minutes per the
   # description). No `for` hold period.
   - alert: AutoRollLastTransition
     expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60
     labels:
       category: infra
       severity: critical
       owner: rmistry@google.com
     annotations:
       abbr: 'skia-flutter-autoroll'
       # Continuation lines and the closing quote are indented deeper than the
       # key, as the YAML spec requires; the folded text itself is unchanged.
       description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes.
         https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
         '

 # Perf
   # Fires whenever the 1h rate of process_failures is above 0.
   # No `for` hold period.
   - alert: AndroidIngestFailures
     expr: 'rate(process_failures[1h]) > 0'
     annotations:
       description: >-
         Error rate for processing buildids is too high.
         See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures
     labels:
       owner: jcgregorio@google.com
       severity: critical
       category: infra

   # Fires whenever tx_log_write_failure is above 0. No `for` hold period.
   - alert: AndroidTxLogFailures
     expr: 'tx_log_write_failure > 0'
     annotations:
       description: >-
         android_ingest failing to record incoming data to transaction log.
         See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log
     labels:
       owner: jcgregorio@google.com
       severity: critical
       category: infra

 # Prober
   # Fires when a prober series with type="failure" has been above 0 for 5m.
   - alert: ProbeFailure
     expr: 'prober{type="failure"} > 0'
     for: 5m
     annotations:
       abbr: "{{ $labels.probename }} {{ $labels.url }}"
       description: >-
         Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to respond
         in at least 5 minutes. See
         https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobersk.json5
         for the endpoint URL.
     labels:
       severity: critical
       category: infra

   # Fires when liveness_probes_s has been above 300 for 5m.
   - alert: ProberLiveness
     expr: 'liveness_probes_s > 300'
     for: 5m
     annotations:
       description: >-
         The prober has failed to probe in the last 5 minutes.
     labels:
       severity: critical
       category: infra
	# Alerts for things in the skia-public cluster only.
	#
	# If anything in this file starts to run in another cluster, such as
	# skia-corp, then break it out into its own alerts_NNNN.yml file
	# and include it in each prometheus-CLUSTER.yml file that it is running in.
	groups:
	- name: general
	rules:

	# This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
	# for processes with this metric.
	- alert: CrashLoop
	expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.instance }}'
	description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
	startup. Logs:

	kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

	https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
	'

	# Container Builder
	- alert: ContainerBuilderFailure
	expr: ci_build_failure >= 2
	labels:
	category: infra
	severity: warning
	annotations:
	abbr: '{{ $labels.trigger }}'
	description: 'The build with trigger name {{ $labels.trigger }} has failed when rebuilding twice in a row.'

	# Continuous Deploy
	- alert: ContinuousDeployLiveness
	expr: liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	description: 'Continuous deploy has failed to recieve a pubsub event in the last 48 hours.

	https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
	'

	- alert: ContinuousDeployFailures
	expr: ci_push_failure > 2
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	description: 'Continuous deploy has failed to successfully push two or more times in a row.

	https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
	'

	# Fiddle
	- alert: InsufficientFiddlerPods
	expr: avg_over_time(pods_idle[2m]) < 5
	for: 15m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods'

	- alert: NamedFiddlesFailing
	expr: named_fiddles_total_invalid > 0
	for: 15m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Some named fiddles are failing. Visit https://named-fiddles.skia.org to see which ones.'

	- alert: FiddlerPodCommunicationErrors
	expr: rate(run_exhaustion[20m]) * 20 * 60 > 5
	for: 5m
	labels:
	category: infra
	severity: warning
	owner: jcgregorio@google.com
	annotations:
	description: 'Fiddle is having trouble communicating with fiddler pods.'

	# CQ Watcher
	- alert: TooManyCLsInCQ
	expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10
	for: 5m
	labels:
	category: infra
	severity: warning
	annotations:
	description: 'There are 10 CLs or more in a Skia CL. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls'

	# Update CQ_TRYBOT_DURATION_SECS_THRESHOLD in go/cq/cq.go if the number below is changed.
	- alert: CQTrybotRunningTooLong
	expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700
	labels:
	category: infra
	severity: warning
	abbr_owner_regex: rmistry@google.com:Build-.*-Android_Framework
	annotations:
	abbr: '{{ $labels.trybot }}'
	description: '{{ $labels.trybot }} ran longer than 45 mins. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold Direct link to logs: https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQTrybotDurationError'

	# Update CQ_TRYBOTS_COUNT_THRESHOLD in go/cq/cq.go if the number below is changed.
	- alert: TooManyCQTrybotsForCL
	expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 50
	labels:
	category: infra
	severity: warning
	annotations:
	description: 'There are more than 50 CQ trybots triggered by at least one CL. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered Direct link to logs: https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQCLsCountError'

	# Fuzzer
	- alert: FuzzerUploadQueueFull
	expr: fuzzer_queue_size_upload > 90
	for: 2m
	labels:
	category: infra
	severity: critical
	owner: kjlubick@google.com
	annotations:
	abbr: '{{ $labels.kubernetes_pod_name }}'
	description: 'Fuzzer upload queue has been very full on {{ $labels.kubernetes_pod_name }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload'

	- alert: FuzzerVersionStale
	expr: fuzzer_version_age{type="current"}/60/60/24 > 10
	labels:
	category: infra
	severity: warning
	owner: kjlubick@google.com
	annotations:
	abbr: '{{ $labels.kubernetes_pod_name }}'
	description: 'The Fuzzer on {{ $labels.kubernetes_pod_name }} hasnt rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version'

	- alert: FuzzerSlowRoll
	expr: fuzzer_version_age{type="pending"}/60/60 > 2
	labels:
	category: infra
	severity: critical
	owner: kjlubick@google.com
	annotations:
	abbr: '{{ $labels.kubernetes_pod_name }}'
	description: 'The fuzzer on {{ $labels.kubernetes_pod_name }} hasnt finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll'

	- alert: FuzzerAnalysisQueueFull
	expr: fuzzer_queue_size_analysis > 900000
	for: 2m
	labels:
	category: infra
	severity: critical
	owner: kjlubick@google.com
	annotations:
	abbr: '{{ $labels.kubernetes_pod_name }}'
	description: 'Fuzzer analysis queue has been very full on {{ $labels.kubernetes_pod_name }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis'

	# datastore backups
	- alert: BackupNotDone
	expr: liveness_backup_success_s/60/60 > 25
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: skia-public
	description: 'A backup of Cloud Datastore has not succeeded in the last 25 hours. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done'

	# alert-to-pubsub liveness
	- alert: AlertToPubSubLiveness
	expr: (min(liveness_alive_s) by (location)) > 90
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'alert-to-pubsub has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub'

	# CT
	# TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public.
	- alert: CTPollerHealthCheckFailed
	expr: healthy{app="ct-master"} != 1
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check'

	- alert: CTFEPendingTaskCount
	expr: num_pending_tasks{app="ctfe"} >= 10
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'

	- alert: CTFEPendingTaskNotRunning
	expr: oldest_pending_task_status{app="ctfe"} >= 2
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'

	- alert: AutoRollLatency
	expr: prober{type="latency",probename="autoroll"} > 200
	for: 10m
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency'

	- alert: AutoRollFrontendErrorRate
	expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001
	labels:
	category: infra
	severity: critical
	owner: borenet@google.com
	annotations:
	abbr: '{{ $labels.app }}'
	description: 'The error rate for autoroll on {{ $labels.app }} is too high.
	https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
	https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'


	- alert: FlutterLicenseScriptFailure
	expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	description: 'The License scripts in the Skia->Flutter roller have failed.
	https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
	https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#flutter_license_script_failure'

	# skia-flutter-autoroll takes a long time to transition because its pre-upload
	# scripts run flutter's license script which can take around 40 minutes.
	- alert: AutoRollLastTransition
	expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60
	labels:
	category: infra
	severity: critical
	owner: rmistry@google.com
	annotations:
	abbr: 'skia-flutter-autoroll'
	description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes.
	https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
	'

	# Perf
	- alert: AndroidIngestFailures
	expr: rate(process_failures[1h]) > 0
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures'

	- alert: AndroidTxLogFailures
	expr: tx_log_write_failure > 0
	labels:
	category: infra
	severity: critical
	owner: jcgregorio@google.com
	annotations:
	description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log'

	# Prober
	- alert: ProbeFailure
	expr: prober{type="failure"} > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.probename }} {{ $labels.url }}'
	description: 'Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to
	respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{
	$labels.probename }}+filename%3Aprobersk.json5 for the endpoint URL.'

	- alert: ProberLiveness
	expr: liveness_probes_s > 300
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	description: 'The prober has failed to probe in the last 5 minutes.'