| # Alerts we need no matter what is running in the cluster. |
| groups: |
| - name: general |
| rules: |
| |
| # General -- each of these alerts has two forms: |
| # - Scraped by annotation: these have a kubernetes_pod_name label |
| # - Scraped by config: these have instance and job labels. |
# InstanceDown (scraped-by-annotation form): fires when the `up` series of a
# target that carries a non-empty kubernetes_pod_name label has been 0 for
# five minutes.
- alert: InstanceDown
  expr: 'up{kubernetes_pod_name!=""} == 0'
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: "{{ $labels.kubernetes_pod_name }}"
    # Folded scalar: the lines below are joined with single spaces, giving a
    # one-line message that ends with a link to the pod's logs.
    description: >-
      Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} has been down
      for more than 5 minutes. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
| |
# InstanceDown (scraped-by-config form): fires when the `up` series of a
# target with an empty kubernetes_pod_name label (identified by its instance
# and job labels instead) has been 0 for five minutes.
- alert: InstanceDown
  expr: up{kubernetes_pod_name=""} == 0
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: '{{ $labels.instance }}'
    # The blank lines inside the scalar fold to newlines, so the kubectl
    # command and the log-viewer URL each land on their own line.
    # reReplaceAll removes the ":<port>" suffix from the instance label to
    # build the app selector. The quotes around the logName filter value are
    # percent-encoded (%22), matching the other log-viewer URLs in this file;
    # a raw double quote is not a valid URL character and can cut the link
    # short when the message is linkified.
    description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
      for more than 5 minutes. Logs:

      kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
      '
| |
# CrashLoop: fires when, for five minutes straight, the largest
# liveness_uptime_s value seen in the trailing 6m window is below 180 (60 * 3).
# NOTE(review): presumably liveness_uptime_s is seconds since the process
# started, i.e. the pod never stays up for three minutes; confirm.
- alert: CrashLoop
  expr: 'max_over_time(liveness_uptime_s{kubernetes_pod_name!=""}[6m]) < 60 * 3'
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: "{{ $labels.kubernetes_pod_name }}"
    # Folded scalar: the lines below are joined with single spaces.
    description: >-
      Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} is crashing on
      startup. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
| |
# TooManyGoRoutines: warns when go_goroutines stays above 3000 for two
# minutes. The app=~".+" matcher limits this to series with a non-empty app
# label.
- alert: TooManyGoRoutines
  expr: 'go_goroutines{app=~".+"} > 3000'
  for: 2m
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: "{{ $labels.app }}"
    # Folded scalar: the lines below are joined with single spaces.
    description: >-
      Too many Go routines in {{ $labels.kubernetes_pod_name }} for app
      {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
| |
# TooManyOpenFDs: warns when process_open_fds exceeds 2000 on any series with
# a non-empty app label. No `for` clause is set on this rule.
- alert: TooManyOpenFDs
  expr: 'process_open_fds{app=~".+"} > 2000'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: "{{ $labels.app }}"
    # Folded scalar: the lines below are joined with single spaces.
    description: >-
      Too many open file handles on {{ $labels.kubernetes_pod_name }} for app
      {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
| |
# PersistentVolumeLowSpace: warns when a volume's used bytes divided by its
# capacity bytes (both reported by the kubelet volume stats) exceeds 0.9.
- alert: PersistentVolumeLowSpace
  expr: '(kubelet_volume_stats_used_bytes /kubelet_volume_stats_capacity_bytes) > 0.9'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: "{{ $labels.persistentvolumeclaim }}"
    description: "{{ $labels.persistentvolumeclaim }} is more than 90% full."
| |
# ContainerVolumeLowSpace: warns when container_fs_usage_bytes divided by
# container_fs_limit_bytes exceeds 0.9.
- alert: ContainerVolumeLowSpace
  expr: '(container_fs_usage_bytes/container_fs_limit_bytes) > 0.9'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: "{{ $labels.instance }}"
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      {{ $labels.device }} on {{ $labels.instance }} in pool
      {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.
| |
# AutoRollBackendErrorRate: critical when the per-second rate of ERROR-level
# log lines from apps matching autoroll-be.*, averaged over the last hour,
# exceeds 0.001 (about 3.6 lines per hour).
- alert: AutoRollBackendErrorRate
  expr: 'rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001'
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: "{{ $labels.app }}"
    # Folded scalar: the message, the log-viewer link and the playbook link
    # are joined with single spaces.
    description: >-
      The error rate for autoroll on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
      https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate
| |
# AutoRollLastTransition: critical when
# liveness_last_successful_autoroll_tick_s exceeds 1200 (20 * 60) for any
# roller other than skia-flutter-autoroll.
- alert: AutoRollLastTransition
  expr: 'liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 20*60'
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: "{{ $labels.roller }}"
    # Line breaks inside the quoted scalar fold to single spaces. The closing
    # quote is on its own line, so the rendered text ends with one trailing
    # space after the URL.
    description: "Autoroll on {{ $labels.app }} has failed to transition for more than 20 minutes.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
      "
| |
# HighExternalQPS: warns when the 30m request rate summed per host exceeds 25
# for any host other than www.googleapis.com.
- alert: HighExternalQPS
  expr: 'sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: "{{ $labels.host }}"
    description: "QPS to {{ $labels.host }} is high. Verify that this is expected."
| |
# HighExternalQPSGoogleAPIs: warns when the 30m request rate to
# www.googleapis.com, summed across all series, exceeds 60.
- alert: HighExternalQPSGoogleAPIs
  expr: sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60
  labels:
    category: infra
    severity: warning
  annotations:
    # Every other alert in this group sets `abbr`. sum() without a `by`
    # clause drops all labels, so {{ $labels.host }} would render empty here;
    # the host is written literally instead.
    abbr: 'www.googleapis.com'
    description: 'QPS to www.googleapis.com is high. Verify that this is expected.'
| |
# AutoRollGetSheriffFailed: critical when autoroll_get_sheriff_success has
# been 0 for two hours.
- alert: AutoRollGetSheriffFailed
  expr: 'autoroll_get_sheriff_success == 0'
  for: 2h
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: "{{ $labels.roller }}"
    # Line breaks inside the quoted scalar fold to single spaces. The closing
    # quote is on its own line, so the rendered text ends with one trailing
    # space after the URL.
    description: "Autoroll on {{ $labels.app }} has failed to obtain the current sheriff
      for more than 2 hours. Please verify that the sheriff endpoint is working
      and that the rotation schedule is not empty.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
      "