# Source blob: dd27697c19862c961967090bcf6ea8ccdc401cc5
# Alerts for things in the skia-corp cluster only.
# Root of the rule file: `groups` lists the rule groups. The alert/expr/for/
# labels/annotations keys used below match the Prometheus alerting-rule layout.
groups:
# The only group visible in this file.
# NOTE(review): leading indentation appears to have been stripped from this
# copy; `rules` presumably nests under this group entry - confirm against the
# checked-in file.
- name: general
rules:
# InternalAutoRoll: warns when the last roll result reported for the
# skia-internal-autoroll roller equals 0 (a failed roll, per the description).
- alert: InternalAutoRoll
  expr: 'autoroll_last_roll_result{roller="skia-internal-autoroll"} == 0'
  # The condition must hold continuously for this long before the alert fires.
  for: 10m
  labels:
    category: infra
    owner: borenet@google.com
    severity: warning
  annotations:
    # Folded scalar: the line break below becomes a single space.
    description: >-
      The last DEPS roll into internal_test repo failed.
      https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#autoroll_failed
# InternalAutoRoll24H: warns when the time since the last landed roll for the
# skia-internal-autoroll roller exceeds 24 hours.
- alert: InternalAutoRoll24H
  # Dividing by 60 twice converts the metric to hours (assuming it is in
  # seconds, as the `_s` suffix suggests) before comparing against 24.
  expr: 'liveness_last_autoroll_landed_s{roller="skia-internal-autoroll"}/60/60 > 24'
  labels:
    category: infra
    owner: borenet@google.com
    severity: warning
  annotations:
    # Folded scalar: the line break below becomes a single space.
    description: >-
      The last-landed roll into internal_test was over 24h ago.
      https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#no_rolls_24h
# TrybotUpdaterErrorRate: critical alert on the ERROR log-line rate of any app
# whose name starts with "trybot-updater".
- alert: TrybotUpdaterErrorRate
  # rate() is a per-second average over the 1h window, so 0.001/s is roughly
  # 3.6 ERROR lines per hour.
  expr: 'rate(num_log_lines{level="ERROR",app=~"trybot-updater.*"}[1h]) > 0.001'
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: "{{ $labels.app }}"
    # Folded scalar: the message and the log-viewer URL are joined by one space.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# SkCQBackendErrorRate: critical alert on the ERROR log-line rate of the
# skcq-be app.
- alert: SkCQBackendErrorRate
  # Exact-match selector: the previous `=~"skcq-be"` was a regex matcher on a
  # plain literal, which selects the same series (PromQL regexes are fully
  # anchored) but hides the intent. rate() is a per-second average over the 1h
  # window, so 0.001/s is roughly 3.6 ERROR lines per hour.
  expr: 'rate(num_log_lines{level="ERROR",app="skcq-be"}[1h]) > 0.001'
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: the message and the log-viewer URL are joined by one space.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# Service account keys checker
# SAKeysCheckerErrorRate: critical alert on the ERROR log-line rate of the
# sa-keys-checker app.
- alert: SAKeysCheckerErrorRate
  # rate() is a per-second average over the 1h window, so 0.001/s is roughly
  # 3.6 ERROR lines per hour.
  expr: 'rate(num_log_lines{level="ERROR",app="sa-keys-checker"}[1h]) > 0.001'
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: "{{ $labels.app }}"
    # Folded scalar: the message and the log-viewer URL are joined by one space.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# SAKeyExpiringSoon: warns while a key's sa_key_expiration_s value is positive
# but below 30 days' worth of seconds.
- alert: SAKeyExpiringSoon
  # 30*24*60*60 = 2,592,000 seconds (30 days). The `0 <` lower bound excludes
  # negative values, which the separate SAKeyExpired rule covers.
  expr: '0 < sa_key_expiration_s{app="sa-keys-checker"} < 30*24*60*60'
  labels:
    category: infra
    # Added for consistency: every other rule in this file carries an owner,
    # and the other sa-keys-checker rules use this one.
    # NOTE(review): confirm this is the intended owner for this alert.
    owner: rmistry@google.com
    severity: warning
  annotations:
    abbr: '{{ $labels.sa }} - {{ $labels.key }}'
    # Folded scalar: the message and the manual link are joined by one space.
    description: >-
      The key {{ $labels.key }} of {{ $labels.sa }} in project {{ $labels.project }} is going to expire in less than 30 days.
      Production Manual: https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expiring_soon
# SAKeyExpired: warns when a key's sa_key_expiration_s value is negative,
# which the description reports as the key having expired.
- alert: SAKeyExpired
  expr: 'sa_key_expiration_s{app="sa-keys-checker"} < 0'
  labels:
    category: infra
    owner: rmistry@google.com
    severity: warning
  annotations:
    abbr: "{{ $labels.sa }} - {{ $labels.key }}"
    # Folded scalar: the message and the manual link are joined by one space.
    description: >-
      The key {{ $labels.key }} of {{ $labels.sa }} in project {{ $labels.project }} has expired.
      Production Manual: https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expired