# Alerts for things in the skia-public cluster only.
#
# If anything in this file starts to run in another cluster, such as
# skia-corp, then break it out into its own alerts_NNNN.yml file
# and include it in each prometheus-CLUSTER.yml file that it is running in.
groups:
- name: general
rules:
# This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs
# for processes with this metric.
- alert: CrashLoop
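# max_over_time over a 6m window stays below 60 * 3 (3 minutes) only if the pod
# never accumulates 3 minutes of uptime, i.e. it keeps restarting shortly after startup.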
expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
for: 5m
labels:
category: infra
severity: critical
annotations:
abbr: '{{ $labels.instance }}'
description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
startup. Logs:
kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}"
'
# Container Builder
- alert: ContainerBuilderFailure
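# ci_build_failure is presumed to count consecutive failed rebuilds, so >= 2
# means the last two builds of this trigger both failed.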
expr: ci_build_failure >= 2
labels:
category: infra
severity: warning
annotations:
abbr: '{{ $labels.trigger }}'
description: 'The build with trigger name {{ $labels.trigger }} has failed twice in a row when rebuilding.'
# Continuous Deploy
- alert: ContinuousDeployLiveness
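# 60 * 60 * 24 * 2 seconds = 48 hours, matching the description below.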
expr: liveness_ci_pubsub_receive_s > 60 * 60 * 24 * 2
for: 5m
labels:
category: infra
severity: critical
annotations:
description: 'Continuous deploy has failed to receive a pubsub event in the last 48 hours.
https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
'
- alert: ContinuousDeployFailures
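# ci_push_failure is presumed to count consecutive failed pushes; > 2 requires
# at least three failures in a row.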
expr: ci_push_failure > 2
for: 5m
labels:
category: infra
severity: critical
annotations:
description: 'Continuous deploy has failed to push successfully more than twice in a row.
https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcontinuous-deploy&minLogLevel=400
'
# Fiddle
- alert: InsufficientFiddlerPods
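# Fires when the number of idle fiddler pods, averaged over 2 minutes, stays
# below 5 for 15 minutes.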
expr: avg_over_time(pods_idle[2m]) < 5
for: 15m
labels:
category: infra
severity: warning
owner: jcgregorio@google.com
annotations:
description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods'
- alert: NamedFiddlesFailing
expr: named_fiddles_total_invalid > 0
for: 15m
labels:
category: infra
severity: warning
owner: jcgregorio@google.com
annotations:
description: 'Some named fiddles are failing. Visit https://named-fiddles.skia.org to see which ones.'
- alert: FiddlerPodCommunicationErrors
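# rate() is per second; multiplying by 20 * 60 rescales it to a count per
# 20-minute window, so this fires above 5 exhaustion errors per 20 minutes.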
expr: rate(run_exhaustion[20m]) * 20 * 60 > 5
for: 5m
labels:
category: infra
severity: warning
owner: jcgregorio@google.com
annotations:
description: 'Fiddle is having trouble communicating with fiddler pods.'
# CQ Watcher
- alert: TooManyCLsInCQ
expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10
for: 5m
labels:
category: infra
severity: warning
annotations:
description: 'There are 10 or more Skia CLs waiting in the CQ. https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls'
# Update CQ_TRYBOT_DURATION_SECS_THRESHOLD in go/cq/cq.go if the number below is changed.
- alert: CQTrybotRunningTooLong
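# 2700 seconds = 45 minutes.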
expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700
labels:
category: infra
severity: warning
abbr_owner_regex: rmistry@google.com:Build-.*-Android_Framework
annotations:
abbr: '{{ $labels.trybot }}'
description: '{{ $labels.trybot }} ran longer than 45 minutes. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold Direct link to logs: https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQTrybotDurationError'
# Update CQ_TRYBOTS_COUNT_THRESHOLD in go/cq/cq.go if the number below is changed.
- alert: TooManyCQTrybotsForCL
expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 50
labels:
category: infra
severity: warning
annotations:
description: 'There are more than 50 CQ trybots triggered by at least one CL. Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered Direct link to logs: https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fcq-watcher&filters=text:CQCLsCountError'
# Fuzzer
- alert: FuzzerUploadQueueFull
expr: fuzzer_queue_size_upload > 90
for: 2m
labels:
category: infra
severity: critical
owner: kjlubick@google.com
annotations:
abbr: '{{ $labels.kubernetes_pod_name }}'
description: 'Fuzzer upload queue has been very full on {{ $labels.kubernetes_pod_name }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload'
- alert: FuzzerVersionStale
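# fuzzer_version_age is in seconds; /60/60/24 converts it to days.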
expr: fuzzer_version_age{type="current"}/60/60/24 > 10
labels:
category: infra
severity: warning
owner: kjlubick@google.com
annotations:
abbr: '{{ $labels.kubernetes_pod_name }}'
description: 'The Fuzzer on {{ $labels.kubernetes_pod_name }} hasn''t rolled its version forward in 10 days. Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version'
- alert: FuzzerSlowRoll
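# /60/60 converts the age in seconds to hours.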
expr: fuzzer_version_age{type="pending"}/60/60 > 2
labels:
category: infra
severity: critical
owner: kjlubick@google.com
annotations:
abbr: '{{ $labels.kubernetes_pod_name }}'
description: 'The fuzzer on {{ $labels.kubernetes_pod_name }} hasn''t finished rolling its version forward in 2 hours. Something might be wrong. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll'
- alert: FuzzerAnalysisQueueFull
expr: fuzzer_queue_size_analysis > 900000
for: 2m
labels:
category: infra
severity: critical
owner: kjlubick@google.com
annotations:
abbr: '{{ $labels.kubernetes_pod_name }}'
description: 'Fuzzer analysis queue has been very full on {{ $labels.kubernetes_pod_name }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis'
# Datastore backups
- alert: BackupNotDone
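# liveness_backup_success_s is the age in seconds of the last successful
# backup; /60/60 converts to hours, giving a daily job one hour of slack.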
expr: liveness_backup_success_s/60/60 > 25
labels:
category: infra
severity: critical
annotations:
abbr: skia-public
description: 'A backup of Cloud Datastore has not succeeded in the last 25 hours. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done'
# alert-to-pubsub liveness
- alert: AlertToPubSubLiveness
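# min by (location) keeps the freshest heartbeat per location, so this fires
# only when every alert-to-pubsub instance in a location is over 90s stale.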
expr: (min(liveness_alive_s) by (location)) > 90
labels:
category: infra
severity: critical
owner: jcgregorio@google.com
annotations:
abbr: '{{ $labels.location }}'
description: 'alert-to-pubsub for {{ $labels.location }} has failed to send a healthz PubSub event in the last 90 seconds. https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub'
# CT
# TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public.
- alert: CTPollerHealthCheckFailed
expr: healthy{app="ct-master"} != 1
for: 5m
labels:
category: infra
severity: critical
owner: rmistry@google.com
annotations:
description: 'CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check'
- alert: CTFEPendingTaskCount
expr: num_pending_tasks{app="ctfe"} >= 10
for: 5m
labels:
category: infra
severity: critical
owner: rmistry@google.com
annotations:
description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'
- alert: CTFEPendingTaskNotRunning
expr: oldest_pending_task_status{app="ctfe"} >= 2
for: 5m
labels:
category: infra
severity: critical
owner: rmistry@google.com
annotations:
description: 'The oldest pending task has been waiting to be executed for a while and has still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'
- alert: AutoRollLatency
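# The prober latency metric is presumed to be in milliseconds, so 200 = 200ms
# as the description states.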
expr: prober{type="latency",probename="autoroll"} > 200
for: 10m
labels:
category: infra
severity: critical
owner: borenet@google.com
annotations:
description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency'
- alert: FlutterLicenseScriptFailure
expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0
for: 5m
labels:
category: infra
severity: critical
owner: rmistry@google.com
annotations:
description: 'The license scripts in the Skia->Flutter roller have failed.
https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#flutter_license_script_failure'
# skia-flutter-autoroll takes a long time to transition because its pre-upload
# scripts run flutter's license script which can take around 40 minutes.
- alert: AutoRollLastTransition
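# 50 * 60 seconds = 50 minutes, leaving headroom over the ~40-minute license
# script run noted above.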
expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60
labels:
category: infra
severity: critical
owner: rmistry@google.com
annotations:
abbr: 'skia-flutter-autoroll'
description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes.
https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
'
# Perf
- alert: AndroidIngestFailures
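# rate() is per second, so 0.01 is roughly 36 failures per hour.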
expr: rate(process_failures[1h]) > 0.01
labels:
category: infra
severity: critical
owner: jcgregorio@google.com
annotations:
description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures'
- alert: AndroidTxLogFailures
expr: tx_log_write_failure > 0
labels:
category: infra
severity: critical
owner: jcgregorio@google.com
annotations:
description: 'android_ingest is failing to record incoming data in the transaction log. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log'
- alert: PerfAndroidClusteringSlow
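# rate()*30*60 rescales the per-second rate to a count per 30-minute window;
# fewer than 10 clustering queries in 30 minutes suggests clustering is stuck.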
expr: rate(perf_clustering_queries{app="skiaperf-android"}[30m])*30*60 < 10
for: 1h
labels:
category: infra
severity: critical
owner: jcgregorio@google.com
annotations:
description: 'https://android-master-perf.skia.org/t/ clustering appears to be stuck. Check the Status on the triage page.'
# Prober
- alert: ProbeFailure
expr: prober{type="failure"} > 0
for: 5m
labels:
category: infra
severity: critical
annotations:
abbr: '{{ $labels.probename }} {{ $labels.url }}'
description: 'Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to
respond for at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{
$labels.probename }}+filename%3Aprobersk.json5 for the endpoint URL.'
- alert: ProberLiveness
expr: liveness_probes_s > 300
for: 5m
labels:
category: infra
severity: critical
annotations:
description: 'The prober has failed to probe in the last 5 minutes.'
# Grafana Backup
- alert: GrafanaBackupLiveness
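# 60*60*25 seconds = 25 hours: one hour of slack on a daily backup.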
expr: liveness_backup_s > 60*60*25
for: 5m
labels:
category: infra
severity: critical
annotations:
description: 'backup-to-gcs has failed to back up the Grafana db in the last 25 hours. Check the logs.'
# Skia Status
- alert: StatusLatency
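# The latency metric appears to be in milliseconds; /1024 approximates the
# conversion to seconds, so 10 corresponds to roughly 10s.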
expr: avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1024 > 10
labels:
category: infra
severity: critical
owner: borenet@google.com
annotations:
description: 'The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond.'
# Datahopper
- alert: FirestoreBackupTooOld
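# /60/60 converts seconds to hours; 26 gives the nightly backup two hours of slack.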
expr: liveness_last_successful_firestore_backup_s/60/60 > 26
labels:
category: infra
severity: critical
owner: benjaminwagner@google.com
annotations:
abbr: '{{ $labels.app }}'
description: 'The most recent successful Firestore nightly backup was more than 26 hours ago. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#firestore_nightly_backup'
# Swarming
- alert: WindowsSkoloOSVersion
# The device on the rpi-01 rack is likely to be out-of-sync with the bots on
# the win-02 and win-03 racks. Therefore, only alert if there are more than
# two versions in use.
expr: windows_skolo_os_version_count{pool="Skia"} > 2
for: 24h
labels:
category: infra
severity: warning
annotations:
description: 'The OS versions of the Windows Skolo bots have diverged. https://goto.google.com/skolo-maintenance'