| # Alerts for things in the skia-public cluster only. |
| # |
| # If anything in this file starts to run in another cluster, such as |
| # skia-corp, then break it out into its own alerts_NNNN.yml file |
| # and include it in each prometheus-CLUSTER.yml file that it is running in. |
| groups: |
| - name: general |
| rules: |
| # Container Builder |
| - alert: ContainerBuilderFailure |
| # No `for:` hold period: fires as soon as ci_build_failure reaches 2 or more. |
| expr: ci_build_failure >= 2 |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: 'The build with trigger name {{ $labels.trigger}} has failed when rebuilding twice in a row.' |
| abbr: 'Container Builder Failed' |
| # Continuous Deploy |
| - alert: ContinuousDeployLiveness |
| # 60 * 60 * 24 * 2 is the 48 hours quoted in the description (the metric is |
| # presumably in seconds, per its _s suffix). The condition has to hold for |
| # 5 minutes before the alert fires. |
| expr: liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.instance }}' |
| # The blank line inside the quoted scalar puts the log link on its own line. |
| # Quotes in the link are percent-encoded as %22, matching the cq-watcher log links in this file. |
| description: '{{ $labels.instance }} has failed to receive a pubsub event in the last 48 hours. |
| |
| https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D%22projects%2Fskia-public%2Flogs%2F{{ $labels.job }}%22 |
| ' |
| - alert: ContinuousDeployFailures |
| # Strictly greater than 2, held for 5 minutes. The description below is worded |
| # to match this threshold ("more than two"), leaving the firing condition |
| # itself unchanged. |
| # NOTE(review): ContainerBuilderFailure uses ">= 2"; confirm whether "> 2" is |
| # the intended threshold here. |
| expr: ci_push_failure > 2 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.instance }}' |
| # The blank line inside the quoted scalar puts the log link on its own line. |
| # Quotes in the link are percent-encoded as %22, matching the cq-watcher log links in this file. |
| description: '{{ $labels.instance }} has failed to successfully push more than two times in a row. |
| |
| https://console.cloud.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D%22projects%2Fskia-public%2Flogs%2F{{ $labels.job }}%22 |
| ' |
| # Fiddle |
| - alert: FiddlerPodsTooLow |
| # Fires when the 2-minute average of pods_idle stays below 5 for 15 minutes. |
| expr: avg_over_time(pods_idle[2m]) < 5 |
| for: 15m |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods' |
| abbr: 'Insufficient fiddler pods.' |
| - alert: NamedFiddlesFailing |
| # Fires when named_fiddles_total_invalid has been non-zero for 15 minutes. |
| expr: named_fiddles_total_invalid > 0 |
| for: 15m |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: 'Some named fiddles are failing. Visit https://named-fiddles.skia.org to see which ones.' |
| abbr: 'named fiddles failing' |
| - alert: FiddleFailingToFindIdlePods |
| # rate() is per second, so multiplying by 20 * 60 scales it back to an |
| # approximate increase over the 20-minute window; more than 1, held for |
| # 5 minutes, fires the alert. |
| expr: rate(run_exhaustion[20m]) * 20 * 60 > 1 |
| for: 5m |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: 'Fiddle is having trouble communicating with fiddler pods.' |
| abbr: 'Fiddler pod communication errors.' |
| |
| # CQ Watcher |
| - alert: CQWatcherCLsCount |
| # Fires when cq-watcher has reported 10 or more CLs waiting in the CQ for 5 minutes. |
| expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: 'Too many CLs in CQ.' |
| description: 'There are 10 CLs or more in the Skia CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls' |
| - alert: CQWatcherTrybotDuration |
| # Looks at the maximum over the last 20 minutes; 2700 matches the 45 minutes |
| # quoted in the description if the metric is in seconds (TODO confirm). |
| # No `for:` hold period. |
| expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700 |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: '{{ $labels.trybot }} ran longer than 45 mins. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold Direct link to logs: https://pantheon.corp.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D%22projects%2Fskia-public%2Flogs%2Fcq-watcher%22%20AND%20textPayload:%20%22CQTrybotDurationError%22' |
| abbr: 'CQ trybot running for too long.' |
| - alert: CQWatcherTrybotsCount |
| # Looks at the maximum over the last 20 minutes and fires above 50. |
| # No `for:` hold period. |
| # NOTE(review): the log link below filters on "CQCLsCountError" rather than a |
| # trybot-specific string - confirm that is what cq-watcher logs for this case. |
| expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 50 |
| labels: |
| severity: warning |
| category: infra |
| annotations: |
| description: 'There are more than 50 CQ trybots triggered by at least one CL. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered Direct link to logs: https://pantheon.corp.google.com/logs/viewer?project=skia-public&advancedFilter=logName%3D%22projects%2Fskia-public%2Flogs%2Fcq-watcher%22%20AND%20textPayload:%20%22CQCLsCountError%22' |
| abbr: 'Too many CQ trybots triggered by CL.' |
| |
| # Fuzzer |
| - alert: FuzzerUploadQueue |
| # Fires when fuzzer_queue_size_upload has been above 90 for 2 minutes. |
| expr: fuzzer_queue_size_upload > 90 |
| for: 2m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'Fuzzer upload queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload' |
| abbr: 'Full Upload Queue' |
| - alert: FuzzerStaleVersion |
| # Dividing by 60/60/24 turns the age into days (assuming the metric is in |
| # seconds - TODO confirm); fires once the current version is older than |
| # 10 days. No `for:` hold period. |
| expr: fuzzer_version_age{type="current"}/60/60/24 > 10 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: 'Fuzzer version stale' |
| # Inside a single-quoted YAML scalar an apostrophe is written as two quotes. |
| description: 'The Fuzzer hasn''t rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version' |
| - alert: FuzzerSlowRoll |
| # Dividing by 60/60 turns the age into hours (assuming the metric is in |
| # seconds - TODO confirm); fires once a pending roll is older than 2 hours. |
| # No `for:` hold period. |
| expr: fuzzer_version_age{type="pending"}/60/60 > 2 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: 'Fuzzer roll taking a while' |
| # Inside a single-quoted YAML scalar an apostrophe is written as two quotes. |
| description: 'The fuzzer hasn''t finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll' |
| - alert: FuzzerAnalysisQueue |
| # Fires when fuzzer_queue_size_analysis has been above 900000 for 2 minutes. |
| expr: fuzzer_queue_size_analysis > 900000 |
| for: 2m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'Fuzzer analysis queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis' |
| abbr: 'Full Analysis Queue' |
| |
| # Datastore backups |
| - alert: BackupNotDone |
| # Dividing by 60/60 turns the liveness value into hours (assuming seconds, per |
| # the _s suffix); fires above the 25 hours quoted in the description. |
| # No `for:` hold period. |
| expr: liveness_backup_success_s/60/60 > 25 |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'A backup of Cloud Datastore has not succeeded in the last 25 hours. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done' |
| abbr: 'skia-public' |
| |
| # alert-to-pubsub liveness |
| - alert: AlertToPubSubLiveness |
| # Takes the minimum liveness_alive_s per `location` label and fires when that |
| # minimum exceeds 90 (the 90s quoted in the description). No `for:` hold period. |
| expr: (min(liveness_alive_s) by (location)) > 90 |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'alert-to-pubsub has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub' |
| abbr: 'alert-to-pubsub' |
| |
| # CT |
| # TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public. |
| - alert: CTPollerHealthCheck |
| # Fires when the ct-master `healthy` metric has been anything other than 1 for 5 minutes. |
| expr: healthy{app="ct-master"} != 1 |
| for: 5m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check' |
| abbr: 'CT poller health check failed.' |
| - alert: CTFEPendingTaskCount |
| # Fires when ctfe has reported 10 or more pending tasks for 5 minutes. |
| expr: num_pending_tasks{app="ctfe"} >= 10 |
| for: 5m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks' |
| abbr: 'CTFE pending task count too high.' |
| - alert: CTFEPendingTaskStatus |
| # Fires when oldest_pending_task_status for ctfe has been 2 or higher for |
| # 5 minutes. The meaning of the status values is not defined in this file. |
| expr: oldest_pending_task_status{app="ctfe"} >= 2 |
| for: 5m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks' |
| abbr: 'CTFE pending task not running.' |
| |
| - alert: AutoRollLatency |
| # Fires when the autoroll latency probe has been above 200 (the 200ms quoted |
| # in the description) for 10 minutes. |
| expr: prober{type="latency",probename="autoroll"} > 200 |
| for: 10m |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency' |
| |
| - alert: AutoRollFrontendErrorRate |
| # Per-second rate of ERROR log lines from autoroll-fe over the last hour; |
| # fires above 0.001. No `for:` hold period. |
| # NOTE(review): the log link below points at project google.com:skia-buildbots |
| # although this file covers skia-public - confirm the link is still correct. |
| expr: rate(num_log_lines{level="ERROR",log_source="autoroll-fe"}[1h]) > 0.001 |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'The error rate for autoroll on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate' |
| |
| # Perf |
| - alert: AndroidIngestFailures |
| # Fires whenever process_failures shows any increase over the last hour |
| # (rate above zero). No `for:` hold period. |
| expr: rate(process_failures[1h]) > 0 |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures' |
| - alert: AndroidTxLogFailures |
| # Fires as soon as tx_log_write_failure is above zero. No `for:` hold period. |
| expr: tx_log_write_failure > 0 |
| labels: |
| severity: critical |
| category: infra |
| annotations: |
| description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log' |