# blob: 0364b1cd55529df30b195f776e08d09a2d442d60 [file] [log] [blame]
# Alerts we need no matter what is running in the cluster.
groups:
- name: general
rules:
# General -- an alert in this section can come in two forms, depending on how
# its target is scraped:
# - Scraped by annotation: these series have a kubernetes_pod_name label.
# - Scraped by config: these series have instance and job labels.
# InstanceDown (pod form): a target that carries a non-empty
# kubernetes_pod_name label has reported up == 0 for 5 minutes straight.
- alert: InstanceDown
  expr: 'up{kubernetes_pod_name!=""} == 0'
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: '{{ $labels.kubernetes_pod_name }}'
    # Folded scalar: the line breaks below become single spaces, so the
    # rendered description is one line ending with the log-viewer URL.
    description: >-
      Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} has been down
      for more than 5 minutes. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
# InstanceDown (config form): a target with an empty/absent
# kubernetes_pod_name label (identified by instance and job instead) has
# reported up == 0 for 5 minutes straight.
- alert: InstanceDown
  expr: 'up{kubernetes_pod_name=""} == 0'
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: '{{ $labels.instance }}'
    # The kubectl hint strips the ":<port>" suffix from the instance label via
    # reReplaceAll to build the app selector.
    # The double quotes around the logName filter value are percent-encoded
    # (%22), matching the other log-viewer URLs in this file; raw quotes inside
    # a URL tend to cut the link short when the text is auto-linked.
    # `>-` folds line breaks to spaces and strips the trailing break, so the
    # rendered text no longer ends with a stray space.
    description: >-
      {{ $labels.instance }} of job {{ $labels.job }} has been down
      for more than 5 minutes. Logs:
      kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
# CrashLoop: over the trailing 6m window the largest liveness_uptime_s value
# seen for a pod stayed below 60 * 3 (presumably seconds, going by the _s
# suffix -- confirm), and that has held for 5 minutes.
- alert: CrashLoop
  expr: 'max_over_time(liveness_uptime_s{kubernetes_pod_name!=""}[6m]) < 60 * 3'
  for: 5m
  labels:
    category: infra
    severity: critical
  annotations:
    abbr: '{{ $labels.kubernetes_pod_name }}'
    # Folded scalar: rendered as a single line ending with the log-viewer URL.
    description: >-
      Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} is crashing on
      startup. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
# TooManyGoRoutines: go_goroutines for any series with a non-empty app label
# has been above 3000 for 2 minutes.
- alert: TooManyGoRoutines
  expr: 'go_goroutines{app=~".+"} > 3000'
  for: 2m
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: rendered as a single line ending with the log-viewer URL.
    description: >-
      Too many Go routines in {{ $labels.kubernetes_pod_name }} for app
      {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
# TooManyOpenFDs: process_open_fds for any series with a non-empty app label
# is above 2000.
# NOTE(review): there is no `for:` clause here, so the alert fires as soon as
# one evaluation sees the condition -- confirm that is intended.
- alert: TooManyOpenFDs
  expr: 'process_open_fds{app=~".+"} > 2000'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: rendered as a single line ending with the log-viewer URL.
    description: >-
      Too many open file handles on {{ $labels.kubernetes_pod_name }} for app
      {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22
# PersistentVolumeLowSpace: the used/capacity ratio reported by the kubelet
# volume stats exceeds 0.9 (90%).
- alert: PersistentVolumeLowSpace
  expr: '(kubelet_volume_stats_used_bytes /kubelet_volume_stats_capacity_bytes) > 0.9'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.persistentvolumeclaim }}'
    description: >-
      {{ $labels.persistentvolumeclaim }} is more than 90% full.
# ContainerVolumeLowSpace: container filesystem usage divided by its limit
# exceeds 0.9 (90%).
- alert: ContainerVolumeLowSpace
  expr: '(container_fs_usage_bytes/container_fs_limit_bytes) > 0.9'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.instance }}'
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      {{ $labels.device }} on {{ $labels.instance }} in pool
      {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.
# AutoRollBackendErrorRate: the per-second rate of ERROR-level log lines,
# averaged over 1h, for apps whose name matches autoroll-be.* is above 0.001.
- alert: AutoRollBackendErrorRate
  expr: 'rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001'
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: rendered as one line -- the message, the log-viewer URL,
    # then the link to the error_rate section of the autoroll PROD.md.
    description: >-
      The error rate for autoroll on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
      https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate
# AutoRollLastTransition: liveness_last_successful_autoroll_tick_s exceeds
# 20*60 (the description reads this as 20 minutes) for any roller other than
# skia-flutter-autoroll.
- alert: AutoRollLastTransition
  expr: 'liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 20*60'
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: '{{ $labels.roller }}'
    # `>-` folds line breaks to spaces and strips the final break. The previous
    # quoted form closed its quote on a line of its own, which folded an extra
    # trailing space onto the end of the URL.
    description: >-
      Autoroll on {{ $labels.app }} has failed to transition for more than 20 minutes.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
# HighExternalQPS: the 30m request rate, summed per host, is above 25 for any
# host other than www.googleapis.com.
- alert: HighExternalQPS
  expr: 'sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25'
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.host }}'
    description: >-
      QPS to {{ $labels.host }} is high. Verify that this is expected.
# HighExternalQPSGoogleAPIs: the 30m request rate to www.googleapis.com,
# summed across all series, is above 60.
- alert: HighExternalQPSGoogleAPIs
  expr: 'sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60'
  labels:
    category: infra
    severity: warning
  annotations:
    # Every other alert in this file sets abbr. The sum() above has no `by`
    # clause, so the result carries no labels to template from; the host is
    # therefore spelled out literally.
    abbr: 'www.googleapis.com'
    description: 'QPS to www.googleapis.com is high. Verify that this is expected.'
# AutoRollGetSheriffFailed: autoroll_get_sheriff_success has been 0 for
# 2 hours.
- alert: AutoRollGetSheriffFailed
  expr: 'autoroll_get_sheriff_success == 0'
  for: 2h
  labels:
    category: infra
    severity: critical
    owner: borenet@google.com
  annotations:
    abbr: '{{ $labels.roller }}'
    # `>-` folds line breaks to spaces and strips the final break. The previous
    # quoted form closed its quote on a line of its own, which folded an extra
    # trailing space onto the end of the URL.
    description: >-
      Autoroll on {{ $labels.app }} has failed to obtain the current sheriff
      for more than 2 hours. Please verify that the sheriff endpoint is working
      and that the rotation schedule is not empty.
      https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}