| # General |
| # Scrape target unreachable: Prometheus's built-in `up` metric has been 0 |
| # for 5 minutes. |
| ALERT InstanceDown |
| IF up == 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "Instance {{ $labels.instance }} down", |
| description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.", |
| } |
| |
| # Generic goroutine-leak guard for all jobs; a tighter per-job version for |
| # datahopper_internal appears later in this file. |
| ALERT TooManyGoRoutines |
| IF go_goroutines > 3000 |
| FOR 2m |
| LABELS { category = "infra", severity = "warning"} |
| ANNOTATIONS { |
| summary = "Too many Go routines in {{ $labels.job }}", |
| description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}." |
| } |
| |
| # An endpoint monitored by the prober has been failing for 5 minutes. |
| ALERT ProbeFailure |
| IF prober{type="failure"} > 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "Probe failed {{ $labels.probename }}", |
| description = "Endpoint {{ $labels.probename }} has failed to respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobers.json for the endpoint URL." |
| } |
| |
| # No FOR clause: fires as soon as a host reports it needs a reboot. |
| ALERT RebootRequired |
| IF reboot_required_i > 0 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Instance {{ $labels.host }} needs rebooting. Owner(s): {{ $labels.owners }}. See https://mon.skia.org/dashboard/db/reboots-required for the full list of instances that need rebooting.", |
| } |
| |
| # Root partition below ~1GB (1e9 bytes) on non-RPi hosts. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host!~".*rpi-.+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Low Root Disk Space on {{ $labels.host }}.", |
| } |
| |
| # Mounted data disks below ~10GB (1e10 bytes). |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-mnt-pd0|df-mnt-dds0"} < 1e10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Low Disk Space on {{ $labels.host }} on disk {{ $labels.resource }}.", |
| } |
| |
| # /b drive below 20GB (2e10 bytes) on non-RPi hosts; includes a suggested |
| # cleanup command in the description. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-b",host!~".*rpi-.+"} < 2e10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Free space has fallen below 20GB on {{ $labels.host }} drive {{ $labels.resource }}.\n\nTry running:\ngo run scripts/run_on_swarming_bots/run_on_swarming_bots.go --alsologtostderr --script=scripts/run_on_swarming_bots/delete_out_dirs.py --dimension id:{{ $labels.host }}", |
| } |
| |
| # /var or /tmp below 100MB (1e8 bytes) on the Raspberry Pi fleet. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-var|df-tmp",host=~"skia-rpi-.+"} < 1e8 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}.", |
| } |
| |
| # CT |
| # The ct-poller's self-reported `healthy` gauge has not been 1 for 5 minutes. |
| # Fixed summary typo: "healthy check" -> "health check" (matches description). |
| ALERT CTPollerHealthCheck |
| IF healthy{instance="skia-ct-master:20000",job="ct-poller"} != 1 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CT poller health check failed.", |
| description = "CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check." |
| } |
| |
| # 10 or more CTFE tasks queued for 5 minutes. |
| ALERT CTFEPendingTaskCount |
| IF num_pending_tasks{instance="skia-ctfe:20000",job="ctfe"} >= 10 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE pending task count too high.", |
| description = "There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks" |
| } |
| |
| # The oldest pending CTFE task is stuck (status code >= 2 -- presumably |
| # "not yet started"; confirm against the ctfe exporter). |
| ALERT CTFEPendingTaskStatus |
| IF oldest_pending_task_status{instance="skia-ctfe:20000",job="ctfe"} >= 2 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE pending task not running.", |
| description = "A task has been waiting to be executed for a while and it's still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks" |
| } |
| |
| # Sustained ERROR log rate (> 0.1 lines/sec over 2m) from CTFE. |
| ALERT CTFEErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="ctfe"}[2m]) > 0.1 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE error rate too high.", |
| description = "The error rate for CTFE is too high. See cloud logging for skia-ctfe." |
| } |
| |
| # CQ Watcher |
| # 10 or more CLs have been waiting in the commit queue for 5 minutes. |
| # Fixed description typo: "in Skia's CL" -> "in Skia's CQ". |
| ALERT CQWatcherCLsCount |
| IF cq_watcher_in_flight_waiting_in_cq{instance="skia-cq-watcher:20000",job="cq_watcher"} >= 10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "Too many CLs in CQ.", |
| description = "There are 10 CLs or more in Skia's CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls" |
| } |
| |
| # A CQ trybot has run longer than 2700s (45 min) within the last 20m window. |
| ALERT CQWatcherTrybotDuration |
| IF max_over_time(cq_watcher_in_flight_trybot_duration{instance="skia-cq-watcher:20000",job="cq_watcher"}[20m]) > 2700 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "CQ trybot running for too long.", |
| description = "{{ $labels.trybot }} ran longer than 45 mins. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold" |
| } |
| |
| # More than 35 trybots triggered by a single CL within the last 20m window. |
| ALERT CQWatcherTrybotsCount |
| IF max_over_time(cq_watcher_in_flight_trybot_num{instance="skia-cq-watcher:20000",job="cq_watcher"}[20m]) > 35 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "Too many CQ trybots triggered by CL.", |
| description = "There are more than 35 CQ trybots triggered by at least one CL. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered" |
| } |
| |
| # Perf |
| # Untriaged perf clusters on the main Perf instance. category "general" so |
| # it routes to the triage rotation rather than infra. |
| ALERT PerfUntriagedClusters |
| IF perf_clustering_untriaged{instance="skia-perf:20000"} > 0 |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| summary = "One or more untriaged clusters.", |
| description = "At least one untriaged perf cluster has been found. Please visit https://perf.skia.org/t/ to triage." |
| } |
| |
| # Same as above for the Android Perf instance; specialroute sends it to the |
| # Android-specific notification channel. |
| ALERT AndroidPerfUntriagedClusters |
| IF perf_clustering_untriaged{instance="skia-android-perf:20000"} > 0 |
| LABELS { category = "general", severity = "warning", specialroute = "android" } |
| ANNOTATIONS { |
| summary = "One or more untriaged clusters.", |
| description = "At least one untriaged perf cluster has been found. Please visit https://android-perf.skia.org/t/ to triage." |
| } |
| |
| # Sustained failure rate (> 0.1/sec over 2m) while processing buildids. |
| ALERT AndroidIngestFailures |
| IF rate(process_failures[2m]) > 0.1 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures." |
| } |
| |
| # Perf ingestion reporting more than 1 error/sec over 5m. |
| ALERT PerfIngestErrorTooHigh |
| IF rate(ingestion{metric="errors"}[5m]) > 1 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "Perf ingestion error rate too high. See https://prom.skia.org/graph?g0.range_input=1h&g0.expr=rate(ingestion%7Bmetric%3D%22errors%22%7D%5B5m%5D)&g0.tab=0" |
| } |
| |
| # For fiddle, debugger, and imageinfo. |
| |
| # The per-job Skia build has failed for two consecutive Chrome DEPS rolls. |
| ALERT BuildsFailed |
| IF builds_failed >= 2 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Build for {{ $labels.job }} has failed for the last 2 chrome DEPS rolls. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#build_fail" |
| } |
| |
| # The per-job repo sync has failed twice in a row. |
| ALERT SyncFailed |
| IF repo_sync_failed >= 2 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Sync for {{ $labels.job }} has failed 2 times in a row. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#sync_fail" |
| } |
| |
| # One or more named fiddles are failing to build/run. |
| ALERT NamedFiddles |
| IF named_failures > 0 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "See https://fiddle.skia.org/f/ and https://skia.googlesource.com/buildbot/%2B/master/fiddle/PROD.md#named_fail" |
| } |
| |
| # datahopper_internal |
| |
| # Oldest untested commit age (seconds) converted to hours; > 3h means the |
| # Google3 autoroller has stopped picking up commits. |
| ALERT Google3AutorollStalled |
| IF datahopper_internal_ingest_build_webhook_oldest_untested_commit_age/60/60 > 3 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Commit has not been picked up by Google3-Autoroller for over three hours. https://sites.google.com/a/google.com/skia-infrastructure/docs/google3-autoroller." |
| } |
| |
| # Liveness (seconds since last successful run) converted to minutes. |
| ALERT DatahopperInternalUpdateWebhookMetrics |
| IF liveness_ingest_build_webhook_oldest_untested_commit_age_metric_s/60 > 10 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "datahopper_internal goroutine for updateWebhookMetrics is dead or failing." |
| } |
| |
| # Tighter goroutine bound for datahopper_internal specifically (the General |
| # section has a 3000-goroutine catch-all for every job). |
| ALERT TooManyGoRoutines |
| IF go_goroutines{instance="skia-internal:20000",job="datahopper_internal"} > 100 |
| FOR 2m |
| LABELS { category = "infra", severity = "warning"} |
| ANNOTATIONS { |
| description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}." |
| } |
| |
| # Datahopper |
| |
| # Datahopper emitting more than 5 ERROR log lines/sec over 10m. |
| ALERT DatahopperErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="datahopper"}[10m]) > 5 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-datahopper2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fdatahopper" |
| } |
</full_update>
| |
| # Swarming |
| |
| # Swarming bot not seen for more than 15 minutes. |
| # swarming_bots_last_seen is a time-since-last-contact value (presumably in |
| # nanoseconds -- confirm against the exporter); converting time units uses |
| # decimal factors like every other conversion in this file (/60, /60/60), |
| # so divide by 1000^3 to get seconds, not by the binary factor 1024^3. |
| ALERT BotMissing |
| IF swarming_bots_last_seen/1000/1000/1000/60 > 15 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Swarming bot {{ $labels.bot }} is missing. https://chromium-swarm.appspot.com/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance" |
| } |
| |
| # Bot has been quarantined for the whole 10m window (avg of a 0/1 gauge). |
| ALERT BotQuarantined |
| IF avg_over_time(swarming_bots_quarantined[10m]) >= 1 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Swarming bot {{ $labels.bot }} is quarantined. https://chromium-swarm.appspot.com/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance" |
| } |
| |
| # Fuzzer |
| |
| # Fuzzer upload queue near capacity (> 90) for 2 minutes. |
| ALERT FuzzerUploadQueue |
| IF fuzzer_queue_size_upload > 90 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Fuzzer upload queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload" |
| } |
| |
| # Fuzzer analysis queue near capacity (> 9000) for 2 minutes. |
| ALERT FuzzerAnalysisQueue |
| IF fuzzer_queue_size_analysis > 9000 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Fuzzer analysis queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis" |
| } |
| |
| # Fewer than 20 new code paths discovered in the last hour: the fuzzer is |
| # probably misconfigured rather than genuinely out of work. |
| ALERT FuzzerNonUseful |
| IF max_over_time(fuzzer_stats_paths_total[1h]) < 20 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The {{ $labels.fuzz_category }} generator hasn't made much progress fuzzing on {{ $labels.host }}. Perhaps it is misconfigured? https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#useless_fuzzer" |
| } |
| |
| # Current fuzzed Skia version is older than 10 days (age in seconds / 86400). |
| ALERT FuzzerStaleVersion |
| IF fuzzer_version_age{type="current"}/60/60/24 > 10 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The Fuzzer hasn't rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version" |
| } |
| |
| # A pending version roll has been in flight for more than 2 hours. |
| ALERT FuzzerSlowRoll |
| IF fuzzer_version_age{type="pending"}/60/60 > 2 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The fuzzer hasn't finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll" |
| } |
| |
| # Status |
| |
| # Status JSON endpoint responding slowly. prober latency values are in |
| # milliseconds (the AutoRollLatency alert below compares the same metric |
| # directly against 200ms), so convert with /1000, not the binary factor |
| # 1024. Description updated to match the 2s threshold. |
| ALERT StatusLatency |
| IF avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1000 > 2 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 2s to respond." |
| } |
| |
| # Prober |
| |
| # Prober liveness (seconds since last run) exceeds 10 minutes. |
| ALERT ProberLiveness |
| IF liveness_probes_s/60 > 10 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Probing has failed to run in at least 10 minutes." |
| } |
| |
| # Issue-tracker ingestion liveness exceeds 30 minutes. |
| ALERT IssueTrackerLiveness |
| IF liveness_issue_tracker_s/60 > 30 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Issue tracker ingestion has failed to run in at least 30 minutes." |
| } |
| |
| # Autoroller |
| |
| # Last Skia->Chrome DEPS roll attempt failed (result gauge == 0) for 10m. |
| ALERT SkiaAutoRoll |
| IF autoroll_last_roll_result{child_path="src/third_party/skia"} == 0 |
| FOR 10m |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| description = "The last DEPS roll attempt for Skia failed. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#autoroll_failed." |
| } |
| |
| # No roll has landed in over 24 hours (liveness seconds -> hours). |
| ALERT SkiaAutoRoll24H |
| IF liveness_last_autoroll_landed_s{child_path="src/third_party/skia"}/60/60 > 24 |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| description = "The last-landed AutoRoll for Skia was over 24h ago. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#no_rolls_24h." |
| } |
| |
| # Autoroll endpoints responding slower than 200ms for 10 minutes. |
| ALERT AutoRollLatency |
| IF prober{type="latency",probename=~".*autoroll"} > 200 |
| FOR 10m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The endpoint for {{ $labels.probename }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency." |
| } |
| |
| # Autoroll emitting more than 5 ERROR log lines/sec over 10m. |
| ALERT AutoRollErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="autoroll"}[10m]) > 5 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "The error rate for autoroll on {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate." |
| } |
| # Gold |
| |
| # Polled Gold ingestion hasn't completed a run in 750s -- roughly two missed |
| # back-to-back cycles per the description. |
| ALERT GoldIngestionStalled |
| IF liveness_gold_s{metric="since-last-run",source="poll"} > 750 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/ingestion.", |
| } |
| |
| # Gold ingestion emitting more than 1 ERROR log line/sec over 2m. |
| # Fixed description typo: duplicated "See See". |
| ALERT GoldIngestionErrorRate |
| IF rate(num_log_lines{level="ERROR",job="ingestion"}[2m]) > 1 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See https://mon.skia.org/dashboard/db/ingestion.", |
| } |
| |