| # Alerts we need no matter what is running in the cluster. |
| groups: |
| - name: general |
| rules: |
| |
| # General -- each of these alerts has two forms: |
| # - Scraped by annotation: these have a kubernetes_pod_name label |
| # - Scraped by config: these have instance and job labels. |
| - alert: InstanceDown |
| expr: up{kubernetes_pod_name!=""} == 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.kubernetes_pod_name }}' |
| description: 'Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} has been down |
| for more than 5 minutes. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}` |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22' |
| |
| # Config-scraped form of InstanceDown: matches targets whose kubernetes_pod_name label is empty, |
| # so the message is built from the instance and job labels instead. The quotes around the logName |
| # filter value in the log-viewer URL are percent-encoded (%22) so the link is a valid URL. |
| - alert: InstanceDown |
| expr: up{kubernetes_pod_name=""} == 0 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.instance }}' |
| description: '{{ $labels.instance }} of job {{ $labels.job }} has been down |
| for more than 5 minutes. Logs: |
| |
| kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }} |
| |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22 |
| ' |
| |
| # Fires when k8s_pod_status for the ContainerCreating status exceeds 30 * 60 |
| # (reported to the user as "more than 30 minutes"). |
| - alert: PodContainerCreatingTooLong |
| expr: k8s_pod_status{status="ContainerCreating"} > 30 * 60 |
| annotations: |
| abbr: '{{ $labels.pod }}' |
| description: '{{ $labels.pod }} of container {{ $labels.container }} in project {{ $labels.project }} has been in {{ $labels.status }} state for more than 30 minutes.' |
| labels: |
| severity: critical |
| category: infra |
| |
| # Fires when k8s_pod_status for the Terminating status exceeds 30 * 60 |
| # (reported to the user as "more than 30 minutes"). |
| - alert: PodTerminatingTooLong |
| expr: k8s_pod_status{status="Terminating"} > 30 * 60 |
| annotations: |
| abbr: '{{ $labels.pod }}' |
| description: '{{ $labels.pod }} of container {{ $labels.container }} in project {{ $labels.project }} has been in {{ $labels.status }} state for more than 30 minutes.' |
| labels: |
| severity: critical |
| category: infra |
| |
| - alert: CrashLoop |
| expr: max_over_time(liveness_uptime_s{kubernetes_pod_name!=""}[6m]) < 60 * 3 |
| for: 5m |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.kubernetes_pod_name }}' |
| description: 'Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} is crashing on |
| startup. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}` |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: TooManyGoRoutines |
| expr: go_goroutines{app=~".+"} > 3000 |
| for: 2m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'Too many Go routines in {{ $labels.kubernetes_pod_name }} for app |
| {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}` |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: TooManyOpenFDs |
| expr: process_open_fds{app=~".+"} > 30000 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'Too many open file handles on {{ $labels.kubernetes_pod_name }} for app |
| {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}` |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Warns once a persistent volume claim's used/capacity ratio goes above 0.9. |
| - alert: PersistentVolumeLowSpace |
| expr: (kubelet_volume_stats_used_bytes /kubelet_volume_stats_capacity_bytes) > 0.9 |
| annotations: |
| abbr: '{{ $labels.persistentvolumeclaim }}' |
| description: '{{ $labels.persistentvolumeclaim }} is more than 90% full.' |
| labels: |
| severity: warning |
| category: infra |
| |
| # Warns once a container filesystem's usage/limit ratio goes above 0.9. |
| - alert: ContainerVolumeLowSpace |
| expr: (container_fs_usage_bytes/container_fs_limit_bytes) > 0.9 |
| annotations: |
| abbr: '{{ $labels.instance }}' |
| description: '{{ $labels.device }} on {{ $labels.instance }} in pool {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.' |
| labels: |
| severity: warning |
| category: infra |
| |
| - alert: AutoRollBackendErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for autoroll on {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} |
| https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#error_rate' |
| |
| - alert: AutoRollFrontendErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"autoroll-fe.*"}[1h]) > 0.001 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for autoroll on {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} |
| https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#error_rate' |
| |
| - alert: AutoRollLastTransition |
| expr: liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 20*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.roller }}' |
| description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 20 minutes. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: AutoRollCLUploadFailure |
| expr: autoroll_cl_upload_failures{} > 2 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.roller }}' |
| description: '{{ $labels.app }} is failing to upload CLs. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Per-host request rate (30m window) summed over all series, for hosts that do not match |
| # googleapis; warns above 25. |
| - alert: HighOutgoingQPS |
| expr: sum(rate(http_request_metrics{host!~".*googleapis.*"}[30m])) by (host) > 25 |
| annotations: |
| abbr: '{{ $labels.host }}' |
| description: 'QPS to {{ $labels.host }} is high. Verify that this is expected.' |
| labels: |
| severity: warning |
| category: infra |
| |
| # Same non-googleapis request rate, but broken down by (host, app); warns above 15. |
| - alert: HighOutgoingQPSByApp |
| expr: sum(rate(http_request_metrics{host!~".*googleapis.*"}[30m])) by (host,app) > 15 |
| annotations: |
| abbr: '{{ $labels.host }}' |
| description: 'QPS to {{ $labels.host }} from {{ $labels.app }} is high. Verify that this is expected.' |
| labels: |
| severity: warning |
| category: infra |
| |
| # Per-host request rate (30m window) for hosts matching googleapis; warns above 100. |
| # The expression groups by host, so the abbr and description name the specific host, |
| # keeping alerts for different googleapis hosts distinguishable. |
| - alert: HighOutgoingQPSGoogleAPIs |
| expr: sum(rate(http_request_metrics{host=~".*googleapis.*"}[30m])) by (host) > 100 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.host }}' |
| description: 'QPS to {{ $labels.host }} is high. Verify that this is expected.' |
| |
| # Request rate (30m window) to googleapis hosts broken down by (host, app); warns above 50. |
| # The expression groups by host and app, so both are surfaced in the annotations. |
| - alert: HighOutgoingQPSGoogleAPIsByApp |
| expr: sum(rate(http_request_metrics{host=~".*googleapis.*"}[30m])) by (host,app) > 50 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.host }}' |
| description: 'QPS to {{ $labels.host }} from {{ $labels.app }} is high. Verify that this is expected.' |
| |
| - alert: HighFirestoreUsageSustained |
| expr: sum(rate(firestore_ops_count{count="rows", app!~"gold.+", app!~"bugs-central"}[24h])) by (app) > 25 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'Firestore usage from {{ $labels.app }} over the last 24h is high. Verify that this is expected, and adjust the alert threshold if it is. Dashboard: https://grafana2.skia.org/d/H4cyODhZz/firestore-read-write-rate?orgId=1&from=now-24h&to=now' |
| |
| - alert: HighFirestoreUsageBurst |
| expr: sum(rate(firestore_ops_count{count="rows"}[30m])) by (app) > 1000 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'Firestore usage from {{ $labels.app }} over the last 30m is high. Verify that this is expected, and adjust the alert threshold if it is. Dashboard: https://grafana2.skia.org/d/H4cyODhZz/firestore-read-write-rate?orgId=1&from=now-30m&to=now' |
| |
| - alert: AutoRollGetReviewersFailed |
| expr: autoroll_get_reviewers_success == 0 |
| for: 2h |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.roller }}' |
| description: 'Autoroll on {{ $labels.app }} has failed to obtain the current set of reviewers for more than 2 hours. Please verify that the reviewers endpoint is working. |
| {{ $labels.reviewer_source }} |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} |
| ' |
| |
| - alert: AutoRollGetReviewersEmpty |
| expr: autoroll_get_reviewers_nonempty == 0 |
| for: 72h |
| labels: |
| category: infra |
| severity: warning |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.roller }}' |
| description: 'Autoroll on {{ $labels.app }} has found no reviewers for more than 72 hours. Please verify that the reviewer endpoints are working and that the rotation schedule is not empty. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} |
| ' |
| |
| # Skia Status |
| - alert: StatusLatency |
| expr: prober{type="latency",probename="skiastatus_json"}/1000 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.probename }}' |
| description: 'The endpoint for {{ $labels.probename }} took more than 10s to respond. https://skia.googlesource.com/buildbot/%2B/main/status/PROD.md#http_latency Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: StatusIncrementalCacheUpdate |
| expr: liveness_last_successful_incremental_cache_update_s > 5*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'IncrementalCache UpdateLoop on {{ $labels.app }} has failed to update data for more than 5 minutes. Playbook: https://skia.googlesource.com/buildbot/%2B/main/status/PROD.md#incremental_cache_failed Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: StatusErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"status.*"}[2m]) > 0.05 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for status on {{ $labels.app }} is too high. https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| # Task Scheduler |
| |
| - alert: TaskSchedulerLiveness |
| expr: liveness_last_successful_task_scheduling_s/60 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#scheduling_failed Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: TaskSchedulerUpdateReposLiveness |
| expr: liveness_last_successful_repo_update_s/60 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update repos and insert new jobs for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#update_repos_failed Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: TaskSchedulerBuildbucketPollLiveness |
| expr: liveness_last_successful_poll_buildbucket_for_new_tryjobs_s/60 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to poll Buildbucket for new tryjobs for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#poll_buildbucket_failed Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: TaskSchedulerBuildbucketUpdateLiveness |
| expr: liveness_last_successful_update_buildbucket_tryjob_state_s/60 > 2 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to send updates to Buildbucket for in-progress and completed tryjobs for the last 2 minutes. https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#update_buildbucket_failed Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: TaskSchedulerLatency |
| expr: prober{type="latency",probename="task_scheduler"} > 300 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.probename }}' |
| description: 'The endpoint for {{ $labels.probename }} took more than 300ms to respond. https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#http_latency Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| - alert: TaskSchedulerErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"task-scheduler.*"}[2m]) > 0.05 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for task_scheduler on {{ $labels.app }} is too high. https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| |
| # Fires when task_candidate_count goes above 1500; the message reports the dimensions label. |
| - alert: TaskSchedulerTooManyCandidates |
| expr: task_candidate_count > 1500 |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'There are too many task candidates for dimensions: {{ $labels.dimensions }} https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#too_many_candidates' |
| labels: |
| owner: borenet@google.com |
| severity: critical |
| category: infra |
| |
| - alert: OverdueMetricsLiveness |
| expr: liveness_last_successful_overdue_metrics_update_s/60 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update overdue_job_specs_s for the last 10 minutes. Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#overdue_metrics_liveness Logs: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&minLogLevel=200&interval=PT1H&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}' |
| |
| # These jobs have tasks with an expiration of 4 hours, and we allow 2 attempts, so they should |
| # normally finish within 8 hours. |
| - alert: OverdueJobSpec |
| expr: overdue_job_specs_s{job_trigger=~"|main",job_name!~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 8 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not finished for any commit in the last 8 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#overdue_job_spec' |
| |
| # These jobs have tasks with an expiration of 9 hours, and we allow 2 attempts, so they should |
| # normally finish within 18 hours. |
| # Applies only to job names matching Valgrind, MSAN or -x86-, with an empty or "main" job_trigger. |
| # The description states the same 18 hour limit that the expression enforces. |
| - alert: OverdueJobSpecLong |
| expr: overdue_job_specs_s{job_trigger=~"|main",job_name=~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 18 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not finished for any commit in the last 18 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#overdue_job_spec' |
| |
| - alert: OverdueJobSpecNightly |
| expr: overdue_job_specs_s{job_trigger="nightly"}/60/60 > 28 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not completed in the last 28 hours (nightly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#overdue_job_spec' |
| |
| - alert: OverdueJobSpecWeekly |
| expr: overdue_job_specs_s{job_trigger="weekly"}/60/60 > 7*24+4 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not completed in the last week + 4 hours (weekly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#overdue_job_spec' |
| |
| - alert: LatestJobAgeNightly |
| expr: latest_job_age_s{job_trigger="nightly"}/60/60 > 25 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not been triggered in the last 25 hours (nightly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#latest_job_age' |
| |
| - alert: LatestJobAgeWeekly |
| expr: latest_job_age_s{job_trigger="weekly"}/60/60 > 7*24+1 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: '{{ $labels.job_name }} has not been triggered in the last week + 1 hour (weekly job). Double check whether the periodic triggers are running correctly (Job defined here: {{ $labels.repo }}/+show/main/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/main/task_scheduler/PROD.md#latest_job_age' |
| |
| # Datahopper |
| |
| - alert: DatahopperErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"datahopper.*"}[10m]) > 5 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: JobMetricsLiveness |
| expr: liveness_last_successful_job_metrics_update_s/60 > 30 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update job metrics for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#job_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: BotCoverageMetricsLiveness |
| expr: liveness_last_successful_bot_coverage_metrics_s/60 > 60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update bot coverage metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#bot_coverage_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: SwarmingTaskMetricsLiveness |
| expr: liveness_last_successful_swarming_task_metrics_s/60 > 60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update swarming task metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#swarming_task_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: EventMetricsLiveness |
| expr: liveness_last_successful_event_metrics_update_s/60 > 30 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update event metrics for {{ $labels.measurement }} for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#event_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Fires when liveness_last_successful_report_bot_metrics_s, converted to minutes, exceeds 10. |
| # This tracks the bot metrics report (see the #swarming_bot_metrics playbook anchor), so the |
| # description says "bot metrics" rather than "task metrics". |
| - alert: SwarmingBotMetricsLiveness |
| expr: liveness_last_successful_report_bot_metrics_s/60 > 10 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.pool }}' |
| description: '{{ $labels.app }} has failed to update swarming bot metrics for pool {{ $labels.pool }} on {{ $labels.server }} for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#swarming_bot_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| - alert: FirestoreBackupMetricsLiveness |
| expr: liveness_last_successful_firestore_backup_metrics_update_s/60 > 15 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has failed to update Firestore backup metrics for the last 15 minutes. https://skia.googlesource.com/buildbot/%2B/main/datahopper/PROD.md#firestore_backup_metrics https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Swarming |
| |
| - alert: BotMissing |
| expr: swarming_bots_last_seen{bot!~"(ct-gce-.*)|(build4.+device.+)|skia-rpi-template"}/1000/1000/1000/60 > 15 |
| labels: |
| category: infra |
| severity: critical |
| abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$ |
| annotations: |
| abbr: '{{ $labels.bot }}' |
| description: 'Swarming bot {{ $labels.bot }} is missing. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance' |
| |
| - alert: BotUnemployed |
| expr: swarming_bots_last_task{pool=~"Skia.*",bot!="skia-rpi-template"}/1000/1000/1000/60/60 >= 72 |
| labels: |
| category: infra |
| severity: critical |
| abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$ |
| annotations: |
| abbr: '{{ $labels.bot }}' |
| description: 'Swarming bot {{ $labels.bot }} has not run a job in 72 hours. Maybe its dimensions need changing? https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance' |
| |
| - alert: BotQuarantined |
| expr: swarming_bots_quarantined{device_state!~"(too_hot)|(low_battery)",bot!="skia-rpi-template"} >= 1 |
| for: 10m |
| labels: |
| category: infra |
| severity: critical |
| abbr_owner_regex: rmistry@google.com:^build[0-9]+-m5$ |
| annotations: |
| abbr: '{{ $labels.bot }}' |
| description: 'Swarming bot {{ $labels.bot }} is quarantined. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance' |
| |
| - alert: DeviceUnhealthy |
| expr: avg(avg_over_time(swarming_bots_quarantined{device_state=~"(too_hot)|(low_battery)",bot!="skia-rpi-template"}[1h])) by (swarming, bot, device_state) >= 1 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.bot }}' |
| description: 'Swarming bot {{ $labels.bot }} is quarantined because the device is {{ $labels.device_state }} and has not resolved itself in 1+ hours. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance' |
| |
| # Fires once machine_processor_device_time_in_recovery_mode_s reaches 60*60 (one hour). |
| - alert: TestMachineUnhealthy |
| expr: machine_processor_device_time_in_recovery_mode_s >= 60*60 |
| annotations: |
| abbr: '{{ $labels.id }}' |
| description: 'Test machine {{ $labels.id }} has been in recovery >1 hour. https://machines.skia.org/?search={{ $labels.id }} https://goto.google.com/skolo-maintenance' |
| labels: |
| severity: critical |
| category: infra |
| |
| # Fires when swarming_bots_uptime_s, converted to hours, exceeds 36. |
| # Bots whose name matches skia-rpi2.* are excluded. |
| - alert: BotUptime |
| expr: swarming_bots_uptime_s{bot!~"skia-rpi2.*"} / 60 / 60 > 36 |
| annotations: |
| abbr: '{{ $labels.bot }}' |
| description: 'Swarming bot {{ $labels.bot }} has gone too long without a reboot. Check the events on the Swarming bot page and reboot manually if necessary. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance' |
| labels: |
| severity: critical |
| category: infra |
| |
| # Alerts for supported branches. |
| |
| - alert: MissingCQConfigForSupportedBranch |
| expr: cq_cfg_branch_exists == 0 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.branch }}' |
| description: 'There is no commit queue config entry for supported branch {{ $labels.branch }} in {{ $labels.repo }}. Either an entry needs to be added to {{ $labels.repo }}/+show/infra/config/main.star or the branch needs to be marked as not supported in {{ $labels.repo }}/+show/infra/config/supported-branches.json' |
| |
| - alert: NoSuchCQTryjobForSupportedBranch |
| expr: cq_cfg_tryjob_exists == 0 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.branch }}' |
| description: 'The commit queue config for supported branch {{ $labels.branch }} in {{ $labels.repo }} references unknown job {{ $labels.tryjob }}. Either the job needs to be removed or renamed in {{ $labels.repo }}/+show/infra/config/main.star or the job needs to be added to {{ $labels.repo }}/+show/{{ $labels.branch}}/infra/bots/tasks.json' |
| |
| - alert: NoBotsExistForTryjobOnSupportedBranch |
| expr: cq_cfg_bot_exists_for_tryjob == 0 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.branch }}' |
| description: 'There are no bots which can run the tasks for {{ $labels.tryjob }} on supported branch {{ $labels.branch }} in {{ $labels.repo }}. Either the dimensions for the tasks used by the job need to be updated in {{ $labels.repo }}/+show/{{ $labels.branch}}/infra/bots/tasks.json or bots need to be added which can run the tasks. See the logs for details: https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # K8s Checker |
| # Fires when liveness_k8s_checker_s exceeds 300 (5 minutes per the description). |
| # The abbr is the cluster label, which the description already reports. |
| - alert: K8sCheckerLiveness |
| expr: liveness_k8s_checker_s > 300 |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| annotations: |
| abbr: '{{ $labels.cluster }}' |
| description: 'k8s_checker has failed to run in the last 5 minutes on cluster {{ $labels.cluster }}. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#K8sCheckerLiveness' |
| |
| # Warns when evicted_pod_metric has been equal to 1 for 5 minutes. |
| - alert: EvictedPod |
| expr: evicted_pod_metric == 1 |
| for: 5m |
| annotations: |
| abbr: '{{ $labels.pod }}' |
| description: 'A pod has been Evicted. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#EvictedPod' |
| labels: |
| severity: warning |
| category: infra |
| |
| - alert: DirtyCommittedK8sImage |
| expr: dirty_committed_image_metric == 1 |
| for: 2h |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.yaml }}' |
| description: 'There is a dirty committed image {{ $labels.committedImage }} in {{ $labels.repo }}/+show/refs/heads/main/{{ $labels.project }}/{{ $labels.yaml }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#DirtyCommittedK8sImage' |
| |
| - alert: DirtyRunningK8sConfig |
| expr: dirty_config_metric == 1 |
| for: 2h |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'For app {{ $labels.exported_app }} and container {{ $labels.container }} the running image differs from the image in {{ $labels.repo }}/+show/refs/heads/main/{{ $labels.project }}/{{ $labels.yaml }} : {{ $labels.liveImage }} != {{ $labels.committedImage }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#DirtyRunningK8sConfig' |
| |
| - alert: StaleK8sImage |
| expr: stale_image_metric > 30 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'For app {{ $labels.exported_app }} and container {{ $labels.container }} the running/committed image {{ $labels.liveImage }} is > 30 days old. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#StaleK8sImage' |
| |
| # Warns when app_running_metric has stayed at 0 for 15 minutes. Series whose |
| # exported_app matches gold-goldpushk.*test.* are excluded by the selector. |
| - alert: CheckedInK8sAppNotRunning |
| expr: app_running_metric{exported_app!~"gold-goldpushk.*test.*"} == 0 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_app }}' |
| description: 'The app {{ $labels.exported_app }} is checked into {{ $labels.repo }}/+show/refs/heads/main/{{ $labels.project }}/{{ $labels.yaml }} but is not running in {{ $labels.project }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#CheckedInK8sAppNotRunning' |
| |
| # Container-level counterpart of CheckedInK8sAppNotRunning: warns when |
| # container_running_metric has stayed at 0 for 15 minutes. |
| - alert: CheckedInK8sContainerNotRunning |
| expr: container_running_metric == 0 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'The container {{ $labels.container }} of app {{ $labels.exported_app }} is checked into {{ $labels.repo }}/+show/refs/heads/main/{{ $labels.project }}/{{ $labels.yaml }} but is not running in {{ $labels.project }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#CheckedInK8sContainerNotRunning' |
| |
| # Warns when running_app_has_config_metric has stayed at 0 for 15 minutes, |
| # which the description reports as an app running without a checked-in config. |
| - alert: RunningK8sAppNotCheckedIn |
| expr: running_app_has_config_metric == 0 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.exported_app }}' |
| description: 'The app {{ $labels.exported_app }} is running in {{ $labels.project }} but is not checked into {{ $labels.repo }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#RunningK8sAppNotCheckedIn' |
| |
| # Warns when running_container_has_config_metric has stayed at 0 for 15 |
| # minutes. The alert is per container (abbr is the container label), so the |
| # description names the container as well as the app it belongs to. |
| - alert: RunningK8sContainerNotCheckedIn |
| expr: running_container_has_config_metric == 0 |
| for: 15m |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'The container {{ $labels.container }} of app {{ $labels.exported_app }} is running in {{ $labels.project }} but is not checked into {{ $labels.repo }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#RunningK8sContainerNotCheckedIn' |
| |
| # Critical when pod_restart_count has changed by 2 or more across the trailing |
| # 1h window. There is no `for:` clause on this rule. |
| - alert: PodRestartingFrequently |
| expr: delta(pod_restart_count{}[1h]) >= 2 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'Container {{ $labels.container }} of app {{ $labels.exported_app }} on cluster {{ $labels.cluster }} has restarted at least twice in the last hour. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#PodRestartingFrequently' |
| |
| # Critical when the absolute value of pod_restart_count reaches 20 or more |
| # (unlike PodRestartingFrequently, no time window is applied). |
| - alert: TooManyPodRestarts |
| expr: pod_restart_count{} >= 20 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.container }}' |
| description: 'Container {{ $labels.container }} of app {{ $labels.exported_app }} on cluster {{ $labels.cluster }} has restarted at least 20 times since initial deployment. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/k8s-checker/PROD.md#TooManyPodRestarts' |
| |
| # Critical when pod_running is above 0 for a series with namespace="default". |
| # Clusters matching skia-public|skia-corp are exempted by the selector. |
| - alert: AppRunningInDefaultNamespace |
| expr: pod_running{namespace="default",cluster!~"skia-public|skia-corp"} > 0 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.exported_app }}' |
| description: 'App {{ $labels.exported_app }} is running in the default namespace in cluster {{ $labels.cluster }}.' |
| |
| |
| # GitSync. |
| # Critical when liveness_last_successful_git_sync_s exceeds 5*60 (five |
| # minutes, per the description). The multi-line description carries both the |
| # playbook link and a logs-viewer link built from project, cluster and app. |
| - alert: GitSyncStalled |
| expr: liveness_last_successful_git_sync_s > 5*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.repo }}' |
| description: 'gitsync has failed to update {{ $labels.repo }} for the last 5 minutes. Check out pod {{ $labels.statefulset_kubernetes_io_pod_name }} in project {{ $labels.project }}. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/gitsync/PROD.md#GitSyncStalled |
| |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} |
| ' |
| |
| # Critical when the 5m rate of ERROR-level num_log_lines for apps matching |
| # gitsync.* exceeds 0.005 per second. |
| - alert: GitSyncErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"gitsync.*"}[5m]) > 0.005 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate for {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/gitsync/PROD.md#GitSyncErrorRate' |
| |
| # Gold Alerts |
| # Warns when liveness_gold_expired_ignore_rules_monitoring_s exceeds 20*60, |
| # i.e. the expired-ignore-rule check named in the description has gone quiet. |
| - alert: GoldIgnoreMonitoring |
| expr: liveness_gold_expired_ignore_rules_monitoring_s > 20*60 |
| labels: |
| category: infra |
| severity: warning |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has not checked for expired ignore rules in a while. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldignoremonitoring' |
| |
| # Warns when the since_last_successful_poll series of liveness_gold_ingestion_s |
| # exceeds 1*60*60. |
| - alert: GoldPollingIngestionStalled |
| expr: liveness_gold_ingestion_s{metric="since_last_successful_poll"} > 1*60*60 |
| labels: |
| category: infra |
| severity: warning |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has not been able to poll the bucket for missed files to ingest for a while. See https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldpollingingestionstalled' |
| |
| # Warns when the since_last_successful_streaming_result series of |
| # liveness_gold_ingestion_s exceeds 24*60*60, a much longer threshold than the |
| # 1*60*60 used by GoldPollingIngestionStalled. |
| - alert: GoldStreamingIngestionStalled |
| expr: liveness_gold_ingestion_s{metric="since_last_successful_streaming_result"} > 24*60*60 |
| labels: |
| category: infra |
| severity: warning |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has not successfully ingested files via streaming in a long time. See https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldstreamingingestionstalled' |
| |
| # Critical when the commentOnCLs series of liveness_periodic_tasks_s exceeds |
| # 20*60 (the 20 minutes quoted in the description). |
| - alert: GoldCommentingStalled |
| expr: liveness_periodic_tasks_s{task="commentOnCLs"} > 20*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'It has been at least 20 minutes since Gold went through all open CLs for {{ $labels.app }} to maybe comment on them. Some process might have hung. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldcommentingstalled' |
| |
| # Critical when the syncKnownDigests series of liveness_periodic_tasks_s |
| # exceeds 60*60 (the 60 minutes quoted in the description). |
| - alert: GoldDigestSyncingStalled |
| expr: liveness_periodic_tasks_s{task="syncKnownDigests"} > 60*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'It has been at least 60 minutes since the Gold finished syncing known digests for {{ $labels.app }}. https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#golddigestsyncingstalled' |
| |
| |
| # Critical when the 5m per-second rate of gold_rpc_call_counter exceeds 50 for |
| # a series; the description reports the route label of that series. |
| - alert: GoldHeavyTraffic |
| expr: rate(gold_rpc_call_counter[5m]) > 50 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} is experiencing high traffic on {{ $labels.route }} https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldheavytraffic' |
| |
| # Critical when diffcalculator_workqueuesize is above 1000, which the |
| # description reports as too many diffs waiting for calculation. |
| - alert: GoldDiffCalcBehind |
| expr: diffcalculator_workqueuesize > 1000 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has too many diffs piled up for calculation https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#golddiffcalcbehind' |
| |
| # Critical when the stat="max" series of diffcalculator_workqueuefreshness |
| # exceeds 1800 (the 30 minutes quoted in the description). |
| - alert: GoldDiffCalcStale |
| expr: diffcalculator_workqueuefreshness{stat="max"} > 1800 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} has some diffs that have not been updated in over 30 minutes https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#golddiffcalcstale' |
| |
| # Critical when, over a 10m window, the change in gold_ingestion_failure |
| # divided by the combined change in gold_ingestion_success and |
| # gold_ingestion_failure exceeds .1 (10 percent, per the description). |
| - alert: GoldIngestionFailures |
| expr: delta(gold_ingestion_failure{}[10m])/(delta(gold_ingestion_success{}[10m])+delta(gold_ingestion_failure{}[10m])) > .1 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} is failing to ingest more than 10 percent of files https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldingestionfailures' |
| |
| # Critical when periodictasks_backup_error is above 0. abbr uses the appgroup |
| # label (not app, as the other Gold rules do) and the description names the |
| # database label. |
| - alert: GoldSQLBackupError |
| expr: periodictasks_backup_error > 0 |
| labels: |
| category: infra |
| severity: critical |
| owner: kjlubick@google.com |
| annotations: |
| abbr: '{{ $labels.appgroup }}' |
| description: 'The {{ $labels.database }} SQL database is failing to be backed up https://skia.googlesource.com/buildbot/+doc/refs/heads/main/golden/docs/PROD.md#goldsqlbackuperror' |
| |
| # Git is supposed to be obtained through CIPD. |
| # Critical when git_from_cipd equals 0 for an app; the rule has no `for:` |
| # clause and no owner label. |
| - alert: GitNotFromCIPD |
| expr: git_from_cipd == 0 |
| labels: |
| category: infra |
| severity: critical |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: '{{ $labels.app }} is using a version of Git which is not obtained via CIPD.' |
| |
| # k8s-deployer |
| # This is currently only used in the skia-infra-public-dev cluster, but it will |
| # eventually be used in more places. |
| # Critical when liveness_k8s_deployer_s exceeds 10*60 (the 10 minutes quoted |
| # in the description). The logs link filters on the k8s-deployer container and |
| # intentionally carries no cursorTimestamp, so it is not pinned to a fixed |
| # point in time. |
| - alert: K8sDeployerLiveness |
| expr: liveness_k8s_deployer_s > 10*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| annotations: |
| abbr: '{{ $labels.cluster }}' |
| description: 'k8s-deployer in {{ $labels.cluster }} has failed to apply changes for more than 10 minutes. |
| https://pantheon.corp.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.container_name%3D%22k8s-deployer%22?project={{ $labels.project }}' |
| |
| |
| # Cron Jobs |
| # Fires when kube_job_failed{condition="true"} is above 0. Both annotations |
| # identify the job through the job_name label, so the description names the |
| # same object as abbr. |
| # NOTE(review): severity "error" is used only here; the other rules in this |
| # part of the file use "warning" or "critical" -- confirm routing handles it. |
| - alert: CronJobFailed |
| expr: kube_job_failed{condition="true"} > 0 |
| labels: |
| category: infra |
| severity: error |
| owner: jcgregorio@google.com |
| annotations: |
| abbr: '{{ $labels.job_name }}' |
| description: 'Job {{ $labels.job_name }} has failed.' |