| # General |
| # Scrape target unreachable: Prometheus's built-in `up` metric has been 0 |
| # for 5 minutes. |
| ALERT InstanceDown |
| IF up == 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "Instance {{ $labels.instance }} down", |
| description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.", |
| } |
| |
| # Generic goroutine-leak guard for all jobs; a tighter per-job version for |
| # datahopper_internal appears later in this file. |
| ALERT TooManyGoRoutines |
| IF go_goroutines > 3000 |
| FOR 2m |
| LABELS { category = "infra", severity = "warning"} |
| ANNOTATIONS { |
| summary = "Too many Go routines in {{ $labels.job }}", |
| description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}." |
| } |
| |
| # An endpoint monitored by the prober has been failing for 5 minutes. |
| ALERT ProbeFailure |
| IF prober{type="failure"} > 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "Probe failed {{ $labels.probename }}", |
| description = "Endpoint {{ $labels.probename }} has failed to respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobers.json for the endpoint URL." |
| } |
| |
| # No FOR clause: fires as soon as a host reports it needs a reboot. |
| ALERT RebootRequired |
| IF reboot_required_i > 0 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Instance {{ $labels.host }} needs rebooting. Owner(s): {{ $labels.owners }}. See https://mon.skia.org/dashboard/db/reboots-required for the full list of instances that need rebooting.", |
| } |
| |
| # Root partition below ~1GB (1e9 bytes) on non-RPi hosts. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host!~".*rpi-.+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Low Root Disk Space on {{ $labels.host }}.", |
| } |
| |
| # Mounted data disks below ~10GB (1e10 bytes). |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-mnt-pd0|df-mnt-dds0"} < 1e10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Low Disk Space on {{ $labels.host }} on disk {{ $labels.resource }}.", |
| } |
| |
| # /b drive below 20GB (2e10 bytes) on non-RPi hosts; includes a suggested |
| # cleanup command in the description. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-b",host!~".*rpi-.+"} < 2e10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Free space has fallen below 20GB on {{ $labels.host }} drive {{ $labels.resource }}.\n\nTry running:\ngo run scripts/run_on_swarming_bots/run_on_swarming_bots.go --alsologtostderr --script=scripts/run_on_swarming_bots/delete_out_dirs.py --dimension id:{{ $labels.host }}", |
| } |
| |
| # /var or /tmp below 100MB (1e8 bytes) on the Raspberry Pi fleet. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-var|df-tmp",host=~"skia-rpi-.+"} < 1e8 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}.", |
| } |
| |
| # CT |
| # The ct-poller's self-reported `healthy` gauge has not been 1 for 5 minutes. |
| # Fixed summary typo: "healthy check" -> "health check" (matches description). |
| ALERT CTPollerHealthCheck |
| IF healthy{instance="skia-ct-master:20000",job="ct-poller"} != 1 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CT poller health check failed.", |
| description = "CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check." |
| } |
| |
| # 10 or more CTFE tasks queued for 5 minutes. |
| ALERT CTFEPendingTaskCount |
| IF num_pending_tasks{instance="skia-ctfe:20000",job="ctfe"} >= 10 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE pending task count too high.", |
| description = "There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks" |
| } |
| |
| # The oldest pending CTFE task is stuck (status code >= 2 -- presumably |
| # "not yet started"; confirm against the ctfe exporter). |
| ALERT CTFEPendingTaskStatus |
| IF oldest_pending_task_status{instance="skia-ctfe:20000",job="ctfe"} >= 2 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE pending task not running.", |
| description = "A task has been waiting to be executed for a while and it's still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks" |
| } |
| |
| # Sustained ERROR log rate (> 0.1 lines/sec over 2m) from CTFE. |
| ALERT CTFEErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="ctfe"}[2m]) > 0.1 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| summary = "CTFE error rate too high.", |
| description = "The error rate for CTFE is too high. See cloud logging for skia-ctfe." |
| } |
| |
| # CQ Watcher |
| # 10 or more CLs have been waiting in the commit queue for 5 minutes. |
| # Fixed description typo: "in Skia's CL" -> "in Skia's CQ". |
| ALERT CQWatcherCLsCount |
| IF cq_watcher_in_flight_waiting_in_cq{instance="skia-cq-watcher:20000",job="cq_watcher"} >= 10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "Too many CLs in CQ.", |
| description = "There are 10 CLs or more in Skia's CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls" |
| } |
| |
| # A CQ trybot has run longer than 2700s (45 min) within the last 20m window. |
| ALERT CQWatcherTrybotDuration |
| IF max_over_time(cq_watcher_in_flight_trybot_duration{instance="skia-cq-watcher:20000",job="cq_watcher"}[20m]) > 2700 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "CQ trybot running for too long.", |
| description = "{{ $labels.trybot }} ran longer than 45 mins. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold" |
| } |
| |
| # More than 35 trybots triggered by a single CL within the last 20m window. |
| ALERT CQWatcherTrybotsCount |
| IF max_over_time(cq_watcher_in_flight_trybot_num{instance="skia-cq-watcher:20000",job="cq_watcher"}[20m]) > 35 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| summary = "Too many CQ trybots triggered by CL.", |
| description = "There are more than 35 CQ trybots triggered by at least one CL. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered" |
| } |
| |
| # Perf |
| # Untriaged perf clusters on the main Perf instance. category "general" so |
| # it routes to the triage rotation rather than infra. |
| ALERT PerfUntriagedClusters |
| IF perf_clustering_untriaged{instance="skia-perf:20000"} > 0 |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| summary = "One or more untriaged clusters.", |
| description = "At least one untriaged perf cluster has been found. Please visit https://perf.skia.org/t/ to triage." |
| } |
| |
| # Same as above for the Android Perf instance; specialroute sends it to the |
| # Android-specific notification channel. |
| ALERT AndroidPerfUntriagedClusters |
| IF perf_clustering_untriaged{instance="skia-android-perf:20000"} > 0 |
| LABELS { category = "general", severity = "warning", specialroute = "android" } |
| ANNOTATIONS { |
| summary = "One or more untriaged clusters.", |
| description = "At least one untriaged perf cluster has been found. Please visit https://android-perf.skia.org/t/ to triage." |
| } |
| |
| # Sustained failure rate (> 0.1/sec over 2m) while processing buildids. |
| ALERT AndroidIngestFailures |
| IF rate(process_failures[2m]) > 0.1 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures." |
| } |
| |
| # Perf ingestion reporting more than 1 error/sec over 5m. |
| ALERT PerfIngestErrorTooHigh |
| IF rate(ingestion{metric="errors"}[5m]) > 1 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "Perf ingestion error rate too high. See https://prom.skia.org/graph?g0.range_input=1h&g0.expr=rate(ingestion%7Bmetric%3D%22errors%22%7D%5B5m%5D)&g0.tab=0" |
| } |
| |
| # For fiddle, debugger, and imageinfo. |
| |
| # The per-job Skia build has failed for two consecutive Chrome DEPS rolls. |
| ALERT BuildsFailed |
| IF builds_failed >= 2 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Build for {{ $labels.job }} has failed for the last 2 chrome DEPS rolls. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#build_fail" |
| } |
| |
| # The per-job repo sync has failed twice in a row. |
| ALERT SyncFailed |
| IF repo_sync_failed >= 2 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Sync for {{ $labels.job }} has failed 2 times in a row. https://skia.googlesource.com/buildbot/%2B/master/{{ $labels.job }}/PROD.md#sync_fail" |
| } |
| |
| # One or more named fiddles are failing to build/run. |
| ALERT NamedFiddles |
| IF named_failures > 0 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "See https://fiddle.skia.org/f/ and https://skia.googlesource.com/buildbot/%2B/master/fiddle/PROD.md#named_fail" |
| } |
| |
| # datahopper_internal |
| |
| # Oldest untested commit age (seconds) converted to hours; > 3h means the |
| # Google3 autoroller has stopped picking up commits. |
| ALERT Google3AutorollStalled |
| IF datahopper_internal_ingest_build_webhook_oldest_untested_commit_age/60/60 > 3 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "Commit has not been picked up by Google3-Autoroller for over three hours. https://sites.google.com/a/google.com/skia-infrastructure/docs/google3-autoroller." |
| } |
| |
| # Liveness (seconds since last successful run) converted to minutes. |
| ALERT DatahopperInternalUpdateWebhookMetrics |
| IF liveness_ingest_build_webhook_oldest_untested_commit_age_metric_s/60 > 10 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "datahopper_internal goroutine for updateWebhookMetrics is dead or failing." |
| } |
| |
| # Tighter goroutine bound for datahopper_internal specifically (the General |
| # section has a 3000-goroutine catch-all for every job). |
| ALERT TooManyGoRoutines |
| IF go_goroutines{instance="skia-internal:20000",job="datahopper_internal"} > 100 |
| FOR 2m |
| LABELS { category = "infra", severity = "warning"} |
| ANNOTATIONS { |
| description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}." |
| } |
| |
| # Datahopper |
| |
| # Datahopper emitting more than 5 ERROR log lines/sec over 10m. |
| ALERT DatahopperErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="datahopper"}[10m]) > 5 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-datahopper2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fdatahopper" |
| } |
</full_update>
| |
| # Swarming |
| |
| # Swarming bot not seen for more than 15 minutes. |
| # swarming_bots_last_seen is a time-since-last-contact value (presumably in |
| # nanoseconds -- confirm against the exporter); converting time units uses |
| # decimal factors like every other conversion in this file (/60, /60/60), |
| # so divide by 1000^3 to get seconds, not by the binary factor 1024^3. |
| ALERT BotMissing |
| IF swarming_bots_last_seen/1000/1000/1000/60 > 15 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Swarming bot {{ $labels.bot }} is missing. https://chromium-swarm.appspot.com/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance" |
| } |
| |
| # Bot has been quarantined for the whole 10m window (avg of a 0/1 gauge). |
| ALERT BotQuarantined |
| IF avg_over_time(swarming_bots_quarantined[10m]) >= 1 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Swarming bot {{ $labels.bot }} is quarantined. https://chromium-swarm.appspot.com/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance" |
| } |
| |
| # Fuzzer |
| |
| # Fuzzer upload queue near capacity (> 90) for 2 minutes. |
| ALERT FuzzerUploadQueue |
| IF fuzzer_queue_size_upload > 90 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Fuzzer upload queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload" |
| } |
| |
| # Fuzzer analysis queue near capacity (> 9000) for 2 minutes. |
| ALERT FuzzerAnalysisQueue |
| IF fuzzer_queue_size_analysis > 9000 |
| FOR 2m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Fuzzer analysis queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis" |
| } |
| |
| # Fewer than 20 new code paths discovered in the last hour: the fuzzer is |
| # probably misconfigured rather than genuinely out of work. |
| ALERT FuzzerNonUseful |
| IF max_over_time(fuzzer_stats_paths_total[1h]) < 20 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The {{ $labels.fuzz_category }} generator hasn't made much progress fuzzing on {{ $labels.host }}. Perhaps it is misconfigured? https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#useless_fuzzer" |
| } |
| |
| # Current fuzzed Skia version is older than 10 days (age in seconds / 86400). |
| ALERT FuzzerStaleVersion |
| IF fuzzer_version_age{type="current"}/60/60/24 > 10 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The Fuzzer hasn't rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version" |
| } |
| |
| # A pending version roll has been in flight for more than 2 hours. |
| ALERT FuzzerSlowRoll |
| IF fuzzer_version_age{type="pending"}/60/60 > 2 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The fuzzer hasn't finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll" |
| } |
| |
| # Status |
| |
| # Status JSON endpoint responding slowly. prober latency values are in |
| # milliseconds (the AutoRollLatency alert below compares the same metric |
| # directly against 200ms), so convert with /1000, not the binary factor |
| # 1024. Description updated to match the 2s threshold. |
| ALERT StatusLatency |
| IF avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1000 > 2 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 2s to respond." |
| } |
| |
| # Prober |
| |
| # Prober liveness (seconds since last run) exceeds 10 minutes. |
| ALERT ProberLiveness |
| IF liveness_probes_s/60 > 10 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Probing has failed to run in at least 10 minutes." |
| } |
| |
| # Issue-tracker ingestion liveness exceeds 30 minutes. |
| ALERT IssueTrackerLiveness |
| IF liveness_issue_tracker_s/60 > 30 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Issue tracker ingestion has failed to run in at least 30 minutes." |
| } |
| |
| # Autoroller |
| |
| # Last Skia->Chrome DEPS roll attempt failed (result gauge == 0) for 10m. |
| ALERT SkiaAutoRoll |
| IF autoroll_last_roll_result{child_path="src/third_party/skia"} == 0 |
| FOR 10m |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| description = "The last DEPS roll attempt for Skia failed. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#autoroll_failed." |
| } |
| |
| # No roll has landed in over 24 hours (liveness seconds -> hours). |
| ALERT SkiaAutoRoll24H |
| IF liveness_last_autoroll_landed_s{child_path="src/third_party/skia"}/60/60 > 24 |
| LABELS { category = "general", severity = "warning" } |
| ANNOTATIONS { |
| description = "The last-landed AutoRoll for Skia was over 24h ago. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#no_rolls_24h." |
| } |
| |
| # Autoroll endpoints responding slower than 200ms for 10 minutes. |
| ALERT AutoRollLatency |
| IF prober{type="latency",probename=~".*autoroll"} > 200 |
| FOR 10m |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The endpoint for {{ $labels.probename }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency." |
| } |
| |
| # Autoroll emitting more than 5 ERROR log lines/sec over 10m. |
| ALERT AutoRollErrorRate |
| IF rate(num_log_lines{level="ERROR",log_source="autoroll"}[10m]) > 5 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "The error rate for autoroll on {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate." |
| } |
| # Gold |
| |
| # Polled Gold ingestion hasn't completed a run in 750s -- roughly two missed |
| # back-to-back cycles per the description. |
| ALERT GoldIngestionStalled |
| IF liveness_gold_s{metric="since-last-run",source="poll"} > 750 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/ingestion.", |
| } |
| |
| # Gold ingestion emitting more than 1 ERROR log line/sec over 2m. |
| # Fixed description typo: duplicated "See See". |
| ALERT GoldIngestionErrorRate |
| IF rate(num_log_lines{level="ERROR",job="ingestion"}[2m]) > 1 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See https://mon.skia.org/dashboard/db/ingestion.", |
| } |
| |