# Alerts we need no matter what is running in the cluster.
groups:
# Single rule group named "general"; every alert rule listed under `rules`
# below belongs to it.
- name: general
  rules:

  # General -- each of these alerts has two forms:
  # - Scraped by annotation: these have a kubernetes_pod_name label
  # - Scraped by config: these have instance and job labels.
  - alert: InstanceDown
    # Annotation-scraped form: only matches series that carry a non-empty
    # kubernetes_pod_name label. Fires once `up` has been 0 for 5 minutes.
    expr: up{kubernetes_pod_name!=""} == 0
    for: 5m
    labels:
      category: infra
      severity: critical
    annotations:
      abbr: '{{ $labels.kubernetes_pod_name }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces. The text carries a kubectl command and a Cloud Logging viewer
      # link whose filter selects the pod by resource.labels.pod_id.
      description: 'Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} has been down
        for more than 5 minutes. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
        https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'

  - alert: InstanceDown
    # Config-scraped form: only matches series whose kubernetes_pod_name label
    # is empty or absent, so the target is identified by its instance and job
    # labels. Fires once `up` has been 0 for 5 minutes.
    expr: up{kubernetes_pod_name=""} == 0
    for: 5m
    labels:
      category: infra
      severity: critical
    annotations:
      abbr: '{{ $labels.instance }}'
      # Multi-line single-quoted scalar: a plain line break folds into a space
      # and each blank line yields a newline in the rendered text.
      # reReplaceAll removes every `:<digits>` sequence (the port) from the
      # instance label before it is used as the `app=` selector.
      # The double quotes around the logName filter value are percent-encoded
      # (%22): a raw `"` is not a legal URI character and makes link
      # auto-detection cut the URL short.
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 5 minutes. Logs:

          kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
          '

  - alert: CrashLoop
    # Fires when the highest liveness_uptime_s value seen over the trailing
    # 6 minutes is below 180 (60 * 3), and that stays true for 5 minutes.
    # Only pods scraped by annotation (non-empty kubernetes_pod_name) match.
    # NOTE(review): the `_s` suffix suggests the metric is in seconds - confirm.
    expr: max_over_time(liveness_uptime_s{kubernetes_pod_name!=""}[6m]) < 60 * 3
    for: 5m
    labels:
      category: infra
      severity: critical
    annotations:
      abbr: '{{ $labels.kubernetes_pod_name }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces.
      description: 'Pod {{ $labels.kubernetes_pod_name }} for app {{ $labels.app }} is crashing on
        startup. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
        https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'

  - alert: TooManyGoRoutines
    # Annotation-scraped form: fires when go_goroutines stays above 3000 for
    # 2 minutes on a series with a non-empty kubernetes_pod_name label.
    expr: go_goroutines{kubernetes_pod_name!=""} > 3000
    for: 2m
    labels:
      category: infra
      severity: warning
    annotations:
      abbr: '{{ $labels.kubernetes_pod_name }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces.
      description: 'Too many Go routines in {{ $labels.kubernetes_pod_name }} for app
        {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
        https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'

  - alert: TooManyGoRoutines
    # Config-scraped form: fires when go_goroutines stays above 3000 for
    # 2 minutes on a series whose kubernetes_pod_name label is empty or absent.
    # Series from the "kubernetes-nodes" job are excluded by the matcher.
    expr: go_goroutines{job!="kubernetes-nodes",kubernetes_pod_name=""} > 3000
    for: 2m
    labels:
      category: infra
      severity: warning
    annotations:
      abbr: '{{ $labels.job }}'
      # Multi-line single-quoted scalar: a plain line break folds into a space
      # and each blank line yields a newline in the rendered text.
      # reReplaceAll removes every `:<digits>` sequence (the port) from the
      # instance label before it is used as the `app=` selector.
      # The double quotes around the logName filter value are percent-encoded
      # (%22): a raw `"` is not a legal URI character and makes link
      # auto-detection cut the URL short.
      description: 'Too many Go routines in {{ $labels.job }} running on {{ $labels.instance
        }}. Logs:

          kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
        '

  - alert: TooManyOpenFDs
    # Annotation-scraped form: matches when process_open_fds exceeds 2000 on a
    # series with a non-empty kubernetes_pod_name label. This rule sets no
    # `for:` hold period.
    expr: process_open_fds{kubernetes_pod_name!=""} > 2000
    labels:
      category: infra
      severity: warning
    annotations:
      abbr: '{{ $labels.kubernetes_pod_name }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces.
      description: 'Too many open file handles on {{ $labels.kubernetes_pod_name }} for app
        {{ $labels.app }}. Logs: `kubectl logs {{ $labels.kubernetes_pod_name }}`
        https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=resource.type%3D%22container%22%0Aresource.labels.pod_id%3D%22{{ $labels.kubernetes_pod_name }}%22'

  - alert: TooManyOpenFDs
    # Config-scraped form: matches when process_open_fds exceeds 2000 on a
    # series whose kubernetes_pod_name label is empty or absent. This rule sets
    # no `for:` hold period.
    expr: process_open_fds{kubernetes_pod_name=""} > 2000
    labels:
      category: infra
      severity: warning
    annotations:
      abbr: '{{ $labels.job }}'
      # Multi-line single-quoted scalar: a plain line break folds into a space
      # and each blank line yields a newline in the rendered text.
      # reReplaceAll removes every `:<digits>` sequence (the port) from the
      # instance label before it is used as the `app=` selector.
      # The double quotes around the logName filter value are percent-encoded
      # (%22): a raw `"` is not a legal URI character and makes link
      # auto-detection cut the URL short.
      description: 'Too many open file handles for {{ $labels.job }} running on {{ $labels.instance
        }}. Logs:

          kubectl logs -l app={{ reReplaceAll `:[0-9]+` `` $labels.instance }} -c {{ $labels.job }}

          https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&advancedFilter=logName%3D%22projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.job }}%22
        '

  # Warns when the ratio of kubelet_volume_stats_used_bytes to
  # kubelet_volume_stats_capacity_bytes rises above 0.9. No `for:` hold period
  # is set on this rule.
  - alert: PersistentVolumeLowSpace
    expr: >-
      (kubelet_volume_stats_used_bytes /kubelet_volume_stats_capacity_bytes) > 0.9
    labels:
      severity: warning
      category: infra
    annotations:
      description: >-
        {{ $labels.persistentvolumeclaim }} is more than 90% full.
      abbr: "{{ $labels.persistentvolumeclaim }}"

  # Warns when container_fs_usage_bytes divided by container_fs_limit_bytes
  # rises above 0.9. No `for:` hold period is set on this rule.
  - alert: ContainerVolumeLowSpace
    expr: >-
      (container_fs_usage_bytes/container_fs_limit_bytes) > 0.9
    labels:
      severity: warning
      category: infra
    annotations:
      # Folded block scalar: the line break below becomes a single space.
      description: >-
        {{ $labels.device }} on {{ $labels.instance }} in pool
        {{ $labels.cloud_google_com_gke_nodepool }} is more than 90% full.
      abbr: "{{ $labels.instance }}"

  - alert: AutoRollBackendErrorRate
    # Fires when the rate of num_log_lines at level ERROR, computed over a
    # 1 hour window, exceeds 0.001 for any app matching `autoroll-be.*`.
    # No `for:` hold period is set on this rule.
    expr: rate(num_log_lines{level="ERROR",app=~"autoroll-be.*"}[1h]) > 0.001
    labels:
      category: infra
      severity: critical
      owner: borenet@google.com
    annotations:
      abbr: '{{ $labels.app }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces. Continuation lines are indented deeper than the `description`
      # key, as YAML requires for a multi-line quoted scalar; the folded value
      # is the same as with shallower continuation lines.
      description: 'The error rate for autoroll on {{ $labels.app }} is too high.
        https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
        https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#error_rate'

  - alert: AutoRollLastTransition
    # Fires when liveness_last_successful_autoroll_tick_s exceeds 600 (10*60).
    # Series whose roller label is "skia-flutter-autoroll" are excluded.
    # No `for:` hold period is set on this rule.
    expr: liveness_last_successful_autoroll_tick_s{roller!="skia-flutter-autoroll"} > 10*60
    labels:
      category: infra
      severity: critical
      owner: borenet@google.com
    annotations:
      abbr: '{{ $labels.roller }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces (the value ends with one trailing space). Continuation lines are
      # indented deeper than the `description` key, as YAML requires for a
      # multi-line quoted scalar; the folded value is the same as with
      # shallower continuation lines.
      description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 10 minutes.
        https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
        '

  # Warns when the summed 30 minute rate of http_request_metrics, grouped by
  # host, exceeds 25. The www.googleapis.com host is excluded by the matcher.
  - alert: HighExternalQPS
    expr: >-
      sum(rate(http_request_metrics{host!="www.googleapis.com"}[30m])) by (host) > 25
    labels:
      severity: warning
      category: infra
    annotations:
      # Folded block scalar: the line break below becomes a single space.
      description: >-
        QPS to {{ $labels.host }} is high.
        Verify that this is expected.
      abbr: "{{ $labels.host }}"

  # Warns when the summed 30 minute rate of http_request_metrics for host
  # www.googleapis.com exceeds 60.
  # NOTE(review): unlike the other rules visible in this file, this one sets no
  # `abbr` annotation - confirm that is intentional.
  - alert: HighExternalQPSGoogleAPIs
    expr: >-
      sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60
    labels:
      severity: warning
      category: infra
    annotations:
      # Folded block scalar: the line break below becomes a single space.
      description: >-
        QPS to www.googleapis.com is high.
        Verify that this is expected.

  - alert: AutoRollGetSheriffFailed
    # Fires once autoroll_get_sheriff_success has been 0 for 2 hours.
    expr: autoroll_get_sheriff_success == 0
    for: 2h
    labels:
      category: infra
      severity: critical
      owner: borenet@google.com
    annotations:
      abbr: '{{ $labels.roller }}'
      # Multi-line single-quoted scalar: the line breaks below fold into single
      # spaces (the value ends with one trailing space). Continuation lines are
      # indented deeper than the `description` key, as YAML requires for a
      # multi-line quoted scalar; the folded value is the same as with
      # shallower continuation lines.
      description: 'Autoroll on {{ $labels.app }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty.
        https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
        '
