| # Alerts for things in the skia-public cluster only. |
| # |
| # If anything in this file starts to run in another cluster, such as |
| # skia-corp, then break it out into its own alerts_NNNN.yml file |
| # and include it in each prometheus-CLUSTER.yml file that it is running in. |
| # Top-level list of rule groups. The rules below all belong to the group named "general". |
| groups: |
| - name: general |
| rules: |
| |
| # This alert belongs in alerts_general.yml, except that skia-corp doesn't have any scrape_configs |
| # for processes with this metric. |
| # Fires when, for 5m, the highest liveness_uptime_s value seen over the last 6m stays |
| # below 60 * 3 (presumably seconds, per the _s suffix). The matcher kubernetes_pod_name="" |
| # selects only series whose kubernetes_pod_name label is empty or absent. |
| # The kubectl hint in the description strips the ":port" suffix from the instance label. |
| - alert: CrashLoop |
| expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.instance }}' |
| description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on |
| startup. Logs: |
| |
| kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }} |
| |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D"projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}" |
| ' |
| |
| # Docker Pushes Watcher |
| # Fires when liveness_docker_watcher_pubsub_receive_s has been above |
| # 60 * 60 * 24 * 2 (48 hours, per the description) for 5m. |
| - alert: DockerPushesWatcherLiveness |
| expr: liveness_docker_watcher_pubsub_receive_s > 60 * 60 * 24 * 2 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| description: 'Docker pushes watcher has failed to receive a pubsub event in the last 48 hours. |
| |
| https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fdocker-pushes-watcher&minLogLevel=400 |
| ' |
| |
| # Fires when docker_watcher_tag_failure has been above 0 for 5m. The image and repo |
| # labels of the failing series are interpolated into the annotations. |
| - alert: DockerPushesWatcherTagFailures |
| expr: docker_watcher_tag_failure > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| abbr: '{{ $labels.image }}' |
| description: 'Docker pushes watcher has failed to successfully add "prod" tag to the docker image of {{ $labels.image }} in the repo {{ $labels.repo }} |
| |
| https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fdocker-pushes-watcher&minLogLevel=400 |
| ' |
| |
| # Fires when docker_watcher_push_failure has been above 0 for 5m. The image and repo |
| # labels of the failing series are interpolated into the annotations. |
| - alert: DockerPushesWatcherPushFailures |
| expr: docker_watcher_push_failure > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| abbr: '{{ $labels.image }}' |
| description: 'Docker pushes watcher has failed to successfully push the docker image of {{ $labels.image }} in the repo {{ $labels.repo }} |
| |
| https://console.cloud.google.com/logs/viewer?project=skia-public&resource=container%2Fcluster_name%2Fskia-public%2Fnamespace_id%2Fdefault&logName=projects%2Fskia-public%2Flogs%2Fdocker-pushes-watcher&minLogLevel=400 |
| ' |
| |
| # Fiddle |
| # Fires when the 2m average of pods_idle has stayed below 5 for 15m. |
| - alert: InsufficientFiddlerPods |
| expr: avg_over_time(pods_idle[2m]) < 5 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods' |
| |
| # Fires when named_fiddles_total_invalid has been above 0 for 15m. |
| - alert: NamedFiddlesFailing |
| expr: named_fiddles_total_invalid > 0 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Some named fiddles are failing. Run `kubectl logs -f -lapp=named-fiddles` |
| to see which ones.' |
| |
| # rate(...[20m]) is per-second, so multiplying by 20 * 60 approximates the increase of |
| # run_exhaustion over the 20m window. Fires when that stays above 5 for 5m. |
| - alert: FiddlerPodCommunicationErrors |
| expr: rate(run_exhaustion[20m]) * 20 * 60 > 5 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Fiddle is having trouble communicating with fiddler pods.' |
| |
| # CQ Watcher |
| # Fires when cq_watcher_in_flight_waiting_in_cq from app="cq-watcher" has been at |
| # least 10 for 5m. |
| - alert: TooManyCLsInCQ |
| expr: cq_watcher_in_flight_waiting_in_cq{app="cq-watcher"} >= 10 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| description: 'There are 10 CLs or more waiting in the CQ. Dry run queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D1+status:open and Commit queue: https://skia-review.googlesource.com/q/label:Commit-Queue%253D2+status:open Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_cls' |
| |
| # Fires as soon as the 20m maximum of cq_watcher_in_flight_trybot_duration exceeds 2700 |
| # (45 minutes, per the description); there is no "for:" hold period. |
| # NOTE(review): abbr_owner_regex is presumably consumed by downstream alert routing that |
| # is not visible in this file - confirm its format there. |
| - alert: CQTrybotRunningTooLong |
| expr: max_over_time(cq_watcher_in_flight_trybot_duration{app="cq-watcher"}[20m]) > 2700 |
| labels: |
| category: infra |
| severity: warning |
| abbr_owner_regex: rmistry@google.com:Build-.*-Android_Framework |
| annotations: |
| abbr: '{{ $labels.trybot }}' |
| description: '{{ $labels.trybot }} ran longer than 45 mins on {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#trybot_duration_beyond_threshold' |
| |
| # Fires as soon as the 20m maximum of cq_watcher_in_flight_trybot_num exceeds 100; |
| # there is no "for:" hold period. |
| - alert: TooManyCQTrybotsForCL |
| expr: max_over_time(cq_watcher_in_flight_trybot_num{app="cq-watcher"}[20m]) > 100 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| description: 'There are more than 100 CQ trybots triggered by {{ $labels.gerritURL }} Playbook: https://skia.googlesource.com/buildbot/%2B/master/cq_watcher/PROD.md#too_many_trybots_triggered' |
| |
| # Datastore backups |
| # liveness_backup_success_s is divided by 60/60/24 to convert it to days (assuming the |
| # metric is in seconds, per the _s suffix). Fires when that exceeds 7. |
| - alert: BackupNotDone |
| expr: liveness_backup_success_s/60/60/24 > 7 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: skia-public |
| description: 'A backup of Cloud Datastore has not succeeded in the last week. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done' |
| |
| # alert-to-pubsub liveness |
| # Considers only liveness_alive_s series with a non-empty location label, takes the |
| # minimum per location, and fires when that minimum exceeds 90. |
| - alert: AlertToPubSubLiveness |
| expr: (min(liveness_alive_s{location=~".+"}) by (location)) > 90 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| abbr: '{{ $labels.location }}' |
| description: 'alert-to-pubsub for {{ $labels.location }} has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub' |
| |
| # CT |
| # TODO(rmistry): Add error rate alert once logmetrics is ported to skia-public. |
| # Fires when num_pending_tasks from app="ctfe" has been at least 10 for 5m. |
| - alert: CTFEPendingTaskCount |
| expr: num_pending_tasks{app="ctfe"} >= 10 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks' |
| |
| # Fires when oldest_pending_task_status from app="ctfe" has been at least 2 for 5m. |
| # NOTE(review): the meaning of each status value is defined by the exporter and is not |
| # visible in this file. |
| - alert: CTFEPendingTaskNotRunning |
| expr: oldest_pending_task_status{app="ctfe"} >= 2 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks' |
| |
| # Fires when the "autoroll" latency probe has reported more than 200 (milliseconds, per |
| # the description) for 10m. |
| - alert: AutoRollLatency |
| expr: prober{type="latency",probename="autoroll"} > 200 |
| for: 10m |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency' |
| |
| # Fires when flutter_license_script_failure from app="autoroll-be-skia-flutter-autoroll" |
| # has been above 0 for 5m. The log URL takes its project from the series' project label, |
| # while the logName path is fixed to skia-public. |
| - alert: FlutterLicenseScriptFailure |
| expr: flutter_license_script_failure{app="autoroll-be-skia-flutter-autoroll"} > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| description: 'The License scripts in the Skia->Flutter roller have failed. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }} |
| https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#flutter_license_script_failure' |
| |
| # skia-flutter-autoroll takes a long time to transition because its pre-upload |
| # scripts run flutter's license script which can take around 40 minutes. |
| # Fires as soon as liveness_last_successful_autoroll_tick_s for the skia-flutter-autoroll |
| # roller exceeds 50*60 (50 minutes, per the description); there is no "for:" hold period. |
| - alert: AutoRollLastTransition |
| expr: liveness_last_successful_autoroll_tick_s{roller="skia-flutter-autoroll"} > 50*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| abbr: 'skia-flutter-autoroll' |
| description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 50 minutes. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }} |
| ' |
| |
| # Perf |
| # Fires as soon as the per-second rate of the ack metric over 30m drops below 0.01; |
| # there is no "for:" hold period. |
| - alert: AndroidIngestStalled |
| expr: rate(ack[30m]) < 0.01 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'The Android Perf instances that run using --event_driven_regression_detection have stopped receiving data. Confirm via https://android-metric-ingest.skia.org/ that data is arriving.' |
| |
| # For each of the three listed apps, sums the 1h per-second rate of |
| # perf_regression_store_found and fires when that per-app sum is below 0.01. |
| - alert: AndroidRegressionDetectionTooSlow |
| expr: sum(rate(perf_regression_store_found{app=~"perf-clustering-android|skiaperf|skiaperf-android-x"}[1h])) by (app) < 0.01 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Perf failed to find any regressions in an hour. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#regression_detection_slow' |
| |
| # Fires as soon as the 1h per-second rate of process_failures exceeds 0.01. |
| - alert: AndroidIngestFailures |
| expr: rate(process_failures[1h]) > 0.01 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures' |
| |
| # Fires as soon as liveness_last_successful_add_s exceeds 300 (presumably seconds, per |
| # the _s suffix). |
| - alert: AndroidIngestLiveness |
| expr: liveness_last_successful_add_s > 300 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Liveness for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#liveness' |
| |
| |
| # Fires as soon as the ratio bad_files / uploads exceeds 0.5. The division only yields a |
| # result for series of the two metrics whose label sets match. |
| - alert: AndroidIngestBadFilesTooHigh |
| expr: bad_files/uploads > 0.5 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Error rate for bad_files is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#bad_files' |
| |
| # Fires as soon as tx_log_write_failure is above 0. |
| - alert: AndroidTxLogFailures |
| expr: tx_log_write_failure > 0 |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log' |
| |
| # Fires when the ratio of successful writes to files received has been below 0.98 |
| # for 10m. |
| - alert: PerfIngestionSuccessRateTooLow |
| expr: perfserver_ingest_successful_write/perfserver_ingest_files_received < 0.98 |
| for: 10m |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Too few ingestion files are being successfully ingested. See https://github.com/google/skia-buildbot/blob/master/perf/PROD.md#success_rate_too_low' |
| |
| # rate(...[1h])*60*60 converts the per-second rate of perf_clustering_runs into runs per |
| # hour. The values are summed across all series of app="perf-clustering-android", and the |
| # alert fires when that sum has been below 1 for 15m. |
| - alert: PerfAndroidClusteringSlow |
| expr: sum(rate(perf_clustering_runs{app="perf-clustering-android"}[1h])*60*60) < 1 |
| for: 15m |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Android Perf Clustering (https://android-master-perf.skia.org) Rate is too low. See https://github.com/google/skia-buildbot/blob/master/perf/PROD.md#android_clustering_rate' |
| |
| # Per series, converts the 1h per-second rate of perf_clustering_runs into runs per hour |
| # and fires when it has been below 1 for 60m. |
| # NOTE(review): the matcher excludes app="skiaperf-android", but PerfAndroidClusteringSlow |
| # selects app="perf-clustering-android", which this rule therefore still covers - confirm |
| # whether that overlap is intended. |
| - alert: PerfClusteringSlow |
| expr: rate(perf_clustering_runs{app!="skiaperf-android"}[1h])*60*60 < 1 |
| for: 60m |
| labels: |
| category: infra |
| severity: critical |
| owner: jcgregorio@google.com |
| annotations: |
| description: 'Perf Clustering Rate is too low. See https://github.com/google/skia-buildbot/blob/master/perf/PROD.md#clustering_rate' |
| |
| # Prober |
| # Fires when any prober series with type="failure" has been above 0 for 5m. The probename |
| # and url labels are interpolated into the annotations and into the code-search link. |
| - alert: ProbeFailure |
| expr: prober{type="failure"} > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.probename }} {{ $labels.url }}' |
| description: 'Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to |
| respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ |
| $labels.probename }}+filename%3Aprobersk.json5 for the endpoint URL.' |
| |
| # Fires when liveness_probes_s has been above 300 (5 minutes, per the description) |
| # for 5m. |
| - alert: ProberLiveness |
| expr: liveness_probes_s > 300 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| description: 'The prober has failed to probe in the last 5 minutes.' |
| |
| |
| # Grafana Backup |
| # Fires when liveness_backup_s has been above 60*60*25 for 5m. Note the threshold is |
| # 25 hours, one hour more than the 24 hours quoted in the description. |
| - alert: GrafanaBackupLiveness |
| expr: liveness_backup_s > 60*60*25 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| description: 'backup-to-gcs has failed to back up the Grafana db in the last 24 hours. Check the logs.' |
| |
| # Skia Status |
| # Fires as soon as the 10m average of the skiastatus_json latency probe exceeds 10 seconds. |
| # The prober latency value is treated as milliseconds by AutoRollLatency in this file |
| # (threshold 200, described as 200ms), so the conversion to seconds divides by 1000. |
| - alert: StatusLatency |
| expr: avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1000 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| description: 'The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond.' |
| |
| # Datahopper |
| |
| # liveness_last_successful_firestore_backup_s from app="datahopper" is divided by |
| # 60/60/24 to convert it to days (assuming seconds, per the _s suffix). Fires as soon as |
| # that exceeds 7. |
| - alert: FirestoreBackupTooOld |
| expr: liveness_last_successful_firestore_backup_s{app="datahopper"}/60/60/24 > 7 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| description: 'The most recent successful Firestore weekly backup was more than 7 days ago. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#firestore_weekly_backup' |
| |
| # Watches the series with file="go.mod": liveness_last_file_modification_s is divided by |
| # 60/60/24 to convert it to days (assuming seconds, per the _s suffix). Fires as soon as |
| # that exceeds 3. |
| - alert: GoDEPSTooOld |
| expr: liveness_last_file_modification_s{file="go.mod"}/60/60/24 > 3 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.file }} in {{ $labels.repo }} too old' |
| description: 'The {{ $labels.file }} file in {{ $labels.repo }} was last updated more than 3 days ago. https://skia.googlesource.com/buildbot/%2B/master/infra/bots/task_drivers/update_go_deps/update_go_deps.go' |
| |
| # External RPis. |
| # Matches instances named skia-rpi-... or skia-e-rpi-..., for the "var" and "tmp" |
| # filesystems. Fires when free space has been below 1e8 (100MB, per the description) |
| # for 5m. Links to the chromium-swarm bot page. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-(e-)?rpi-.+",type="free"} < 1e8 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # External bots except RPis. |
| # Matches skia-e-... instances whose name continues with a character other than "r" |
| # (which is how the RPis are excluded), for the "root" filesystem. Fires when free space |
| # has been below 1e9 (presumably bytes) for 5m. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df="root",exported_instance=~"skia-e-[^r].+",type="free"} < 1e9 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # Dev RPis. |
| # Matches instances named skia-d-rpi-..., for the "var" and "tmp" filesystems. Fires when |
| # free space has been below 1e8 (100MB, per the description) for 5m. Links to the |
| # chromium-swarm-dev bot page. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-d-rpi-.+",type="free"} < 1e8 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # Dev bots except RPis. |
| # Matches skia-d-... instances whose name continues with a character other than "r" |
| # (which is how the RPis are excluded), for the "root" filesystem. Fires when free space |
| # has been below 1e9 (presumably bytes) for 5m. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df="root",exported_instance=~"skia-d-[^r].+",type="free"} < 1e9 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # Internal RPis. |
| # Matches instances named skia-i-rpi-..., for the "var" and "tmp" filesystems. Fires when |
| # free space has been below 1e8 (100MB, per the description) for 5m. Links to the |
| # chrome-swarming bot page. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df=~"var|tmp",exported_instance=~"skia-i-rpi-.+",type="free"} < 1e8 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Free space has fallen below 100MB on {{ $labels.exported_instance }} drive {{ $labels.df }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # Internal bots except RPis. |
| # Matches skia-i-... instances whose name continues with a character other than "r" |
| # (which is how the RPis are excluded), for the "root" filesystem. Fires when free space |
| # has been below 1e9 (presumably bytes) for 5m. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df="root",exported_instance=~"skia-i-[^r].+",type="free"} < 1e9 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.exported_instance }} https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # GCE machines (other than bots), root disk. |
| # Covers every instance whose name does NOT start with skia-e-, skia-i-, skia-d- or |
| # skia-rpi-, for the "root" filesystem. Fires when free space has been below 1e9 |
| # (presumably bytes) for 5m. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df="root",exported_instance!~"skia-(e|i|d|rpi)-.+",type="free"} < 1e9 |
| for: 5m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Low Root Disk Space on {{ $labels.exported_instance }}. https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| # GCE bots, /b (aka /mnt/pd0) |
| # Matches instances named skia-e-gce..., for the "mnt-pd0" filesystem. Fires when free |
| # space has been below 1e10 (presumably bytes) for 5m. This is the only DiskSpaceLow |
| # variant in this file with severity critical. |
| - alert: DiskSpaceLow |
| expr: collectd_df_df_complex{df="mnt-pd0", exported_instance=~"skia-e-gce.+",type="free"} < 1e10 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.exported_instance }}' |
| description: 'Low Disk Space on /b for {{ $labels.exported_instance }}. https://skia.googlesource.com/buildbot/+doc/master/docs/PROD.md#diskspacelow' |
| |
| |
| # Envoy |
| # Fires when envoy_cluster_bind_errors has been above 0 for 5m. |
| # NOTE(review): severity is "error" here, while the non-Envoy rules in this file use |
| # only "warning" or "critical" - confirm downstream routing handles this value. |
| - alert: EnvoyClusterBindError |
| expr: envoy_cluster_bind_errors > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: error |
| annotations: |
| abbr: '{{ $labels.envoy_cluster_name }}' |
| description: 'Envoy Cluster Bind Error for {{ $labels.envoy_cluster_name |
| }}. https://skia.googlesource.com/buildbot/+doc/master/skfe/PROD.md#cluster_bind_error' |
| |
| # Fires when envoy_runtime_load_error has been above 0 for 5m. |
| # NOTE(review): severity is "error" here, while the non-Envoy rules in this file use |
| # only "warning" or "critical" - confirm downstream routing handles this value. |
| - alert: EnvoyRuntimeLoadError |
| expr: envoy_runtime_load_error > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: error |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'Envoy Runtime Load Error for {{ $labels.app }}. https://skia.googlesource.com/buildbot/%2B/master/skfe/PROD.md#runtime_load_error' |
| |
| # Fires when envoy_cluster_lb_local_cluster_not_ok has been above 0 for 5m. |
| # NOTE(review): severity is "error" here, while the non-Envoy rules in this file use |
| # only "warning" or "critical" - confirm downstream routing handles this value. |
| - alert: EnvoyClusterNotOK |
| expr: envoy_cluster_lb_local_cluster_not_ok > 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: error |
| annotations: |
| abbr: '{{ $labels.envoy_cluster_name }}' |
| description: 'Envoy Cluster Not OK for {{ $labels.envoy_cluster_name }}. https://skia.googlesource.com/buildbot/%2B/master/skfe/PROD.md#envoy_cluster_lb_local_cluster_not_ok' |
| |
| # Push |
| # Fires when the minimum of dirty_packages over the last 25h is at least 1, i.e. every |
| # sample in that window was >= 1. Note the window is 25 hours, one hour more than the |
| # 24 hours quoted in the description. |
| - alert: DirtyPackages |
| expr: min_over_time(dirty_packages[25h]) >= 1 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| description: 'One or more dirty packages have been running for more than 24 hours. https://push.skia.org' |
| |
| # Gold |
| # Fires as soon as gold_empty_commits_at_head is at least 100 for any series whose |
| # appgroup label starts with "gold-chrome". |
| - alert: GoldNoDataAtHeadChrome |
| expr: gold_empty_commits_at_head{appgroup=~"gold-chrome.*"} >= 100 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The most recent 100 commits for {{ $labels.app }} have no data. https://skia.googlesource.com/buildbot/+doc/refs/heads/master/golden/docs/PROD.md#goldnodataathead' |