# Alerts for things in the skolo-rack4 cluster only.
#
# Every rule carries `category` and `severity` labels plus `abbr` and
# `description` annotations. Descriptions are single-quoted multi-line
# scalars: a plain line break folds into one space, a blank line becomes a
# newline.
groups:
  - name: general
    rules:
      # Fires when the largest liveness_uptime_s sample of the last 6 minutes
      # is still below 180 (60 * 3; presumably seconds, given the `_s`
      # suffix) and that has held for 5 minutes. The kubernetes_pod_name=""
      # matcher restricts the rule to series where that label is empty or
      # absent.
      - alert: CrashLoop
        expr: max_over_time(liveness_uptime_s{kubernetes_pod_name=""}[6m]) < 60 * 3
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          # reReplaceAll strips the `:port` suffix from the instance label to
          # build the `app=` selector. The quotes in the log-viewer filter are
          # percent-encoded (%22), matching the other URLs in this file, so
          # the link is not cut short at a raw double quote.
          description: '{{ $labels.instance }} of job {{ $labels.job }} is crashing on
            startup. Logs:

            kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

            https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
            '

      # Fires when `up` has been 0 for 5 minutes on a series where
      # kubernetes_pod_name is empty or absent.
      - alert: InstanceDown
        expr: up{kubernetes_pod_name=""} == 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
            for more than 5 minutes. Logs:

            kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}
            '

      # Fires when go_goroutines stays above 3000 for 2 minutes. The
      # app=~".+" matcher requires a non-empty `app` label.
      - alert: TooManyGoRoutines
        expr: go_goroutines{app=~".+"} > 3000
        for: 2m
        labels:
          category: infra
          severity: warning
        annotations:
          abbr: '{{ $labels.app }}'
          # No blank lines here, so the whole description folds into a single
          # line with the pieces separated by spaces.
          description: 'Too many Go routines in {{ $labels.kubernetes_pod_name }} for app
            {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
            https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'

      # Fires when process_open_fds exceeds 5000 on a series with a non-empty
      # `app` label.
      # NOTE(review): unlike the other rules there is no `for:` clause, so
      # this alert fires on the first evaluation that crosses the threshold;
      # confirm that is intended.
      - alert: TooManyOpenFDs
        expr: process_open_fds{app=~".+"} > 5000
        labels:
          category: infra
          severity: warning
        annotations:
          abbr: '{{ $labels.app }}'
          description: 'Too many open file handles on {{ $labels.kubernetes_pod_name }} for app
            {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
            https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'