# blob: 97b3e8bc837189355837740121d49555be33bd05 [file] [log] [blame]
# Alerts for things in the skia-corp cluster only.
groups:
- name: general
rules:
# Fires once autoroll_last_roll_result for the skia-internal-autoroll roller
# has stayed at 0 for 10 minutes; the description treats 0 as a failed roll.
- alert: InternalAutoRoll
  expr: autoroll_last_roll_result{roller="skia-internal-autoroll"} == 0
  for: 10m
  labels:
    category: infra
    owner: borenet@google.com
    severity: warning
  annotations:
    # Folded scalar: the line break below becomes a single space.
    description: >-
      The last DEPS roll into internal_test repo failed.
      https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#autoroll_failed
# Divides liveness_last_autoroll_landed_s for the skia-internal-autoroll
# roller by 60 twice and fires as soon as the result exceeds 24 (the
# description words this as "over 24h ago"). There is no `for:` hold period.
- alert: InternalAutoRoll24H
  expr: liveness_last_autoroll_landed_s{roller="skia-internal-autoroll"}/60/60 > 24
  labels:
    category: infra
    owner: borenet@google.com
    severity: warning
  annotations:
    # Folded scalar: the line break below becomes a single space.
    description: >-
      The last-landed roll into internal_test was over 24h ago.
      https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#no_rolls_24h
# Per-second rate of ERROR-level log lines, averaged over a 1h window, for
# every app whose name starts with "trybot-updater". Fires above 0.001/s,
# i.e. more than roughly 3.6 error lines per hour.
- alert: TrybotUpdaterErrorRate
  expr: rate(num_log_lines{level="ERROR",app=~"trybot-updater.*"}[1h]) > 0.001
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: the line break below becomes a single space. The link is
    # built from the project, cluster and app labels of the firing series.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# Per-second rate of ERROR-level log lines, averaged over a 1h window, for
# the skcq-be app. Fires above 0.001/s, i.e. more than roughly 3.6 error
# lines per hour.
- alert: SkCQBackendErrorRate
  # The app selector is an exact-match (`=`) matcher: the name contains no
  # regex metacharacters and Prometheus anchors regex matchers, so `=~` would
  # select exactly the same series while obscuring the intent.
  expr: rate(num_log_lines{level="ERROR",app="skcq-be"}[1h]) > 0.001
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: the line break below becomes a single space. The link is
    # built from the project, cluster and app labels of the firing series.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# Service account keys checker
# Per-second rate of ERROR-level log lines, averaged over a 1h window, for
# the sa-keys-checker app. Fires above 0.001/s, i.e. more than roughly 3.6
# error lines per hour.
- alert: SAKeysCheckerErrorRate
  expr: rate(num_log_lines{level="ERROR",app="sa-keys-checker"}[1h]) > 0.001
  labels:
    category: infra
    owner: rmistry@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: the line break below becomes a single space. The link is
    # built from the project, cluster and app labels of the firing series.
    description: >-
      The error rate on {{ $labels.app }} is too high.
      https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}
# Fires for each sa_key_expiration_s series reported by sa-keys-checker whose
# value is strictly between 0 and 30*24*60*60 (2592000). The description
# words the upper bound as "less than 30 days".
# A value of exactly 0 matches neither this alert nor SAKeyExpired.
# NOTE(review): unlike the other alerts in this file, no `owner` label is
# set here - confirm that is intentional.
- alert: SAKeyExpiringSoon
  expr: 0 < sa_key_expiration_s{app="sa-keys-checker"} < 30*24*60*60
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.sa }} - {{ $labels.key }}'
    # Folded scalar: every line break below becomes a single space.
    description: >-
      The key {{ $labels.key }} of {{ $labels.sa }} in project
      {{ $labels.project }} is going to expire in less than 30 days.
      Production Manual:
      https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expiring_soon
# Fires for each sa_key_expiration_s series reported by sa-keys-checker whose
# value is negative; the description treats that as an already-expired key.
# NOTE(review): unlike the other alerts in this file, no `owner` label is
# set here - confirm that is intentional.
- alert: SAKeyExpired
  expr: sa_key_expiration_s{app="sa-keys-checker"} < 0
  labels:
    category: infra
    severity: warning
  annotations:
    abbr: '{{ $labels.sa }} - {{ $labels.key }}'
    # Folded scalar: every line break below becomes a single space.
    description: >-
      The key {{ $labels.key }} of {{ $labels.sa }} in project
      {{ $labels.project }} has expired.
      Production Manual:
      https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expired
# Louhi continuous deployment pipeline.
# Fires for any cd_image_build_latency_s series above 2*60*60 (7200), i.e.
# two hours if the metric is in seconds as its `_s` suffix suggests
# (TODO confirm). No label selector is applied, so every series is checked.
- alert: CDImageBuildLatency
  expr: cd_image_build_latency_s > 2*60*60
  labels:
    category: infra
    owner: borenet@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.image }}'
    # Folded scalar: every line break below becomes a single space.
    description: >-
      No {{ $labels.image }} Docker image has been built for commit
      https://skia.googlesource.com/buildbot.git/+/{{ $labels.commit }}
      Check the associated flow in
      https://louhi.dev/?projectId=6316342352543744#/home
# Fires for any cd_k8s_config_latency_s series above 2*60*60 (7200), i.e.
# two hours if the metric is in seconds as its `_s` suffix suggests
# (TODO confirm). No label selector is applied, so every series is checked.
- alert: CDK8sConfigLatency
  expr: cd_k8s_config_latency_s > 2*60*60
  labels:
    category: infra
    owner: borenet@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.image }}'
    # Folded scalar: every line break below becomes a single space.
    description: >-
      No commit has landed to update the {{ $labels.image }} Docker image in
      the k8s-config repo for commit
      https://skia.googlesource.com/buildbot.git/+/{{ $labels.commit }}
      Check the associated flow in
      https://louhi.dev/?projectId=6316342352543744#/home
# Fires for any liveness_last_successful_cd_pipeline_metrics_s series above
# 30*60 (1800), i.e. 30 minutes if the metric is in seconds as its `_s`
# suffix suggests (TODO confirm).
- alert: LastCDMetricsSuccess
  expr: liveness_last_successful_cd_pipeline_metrics_s > 30*60
  labels:
    category: infra
    owner: borenet@google.com
    severity: critical
  annotations:
    abbr: '{{ $labels.app }}'
    # Folded scalar: the line break below becomes a single space.
    # NOTE(review): the log query hard-codes the datahopper-internal container
    # and the google.com:skia-corp project instead of using the labels of the
    # firing series - confirm that is intended.
    description: >-
      CD metrics ingestion on {{ $labels.app }} has not completed successfully.
      https://pantheon.corp.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.container_name%3D%22datahopper-internal%22%0AsourceLocation.file%3D%22cd_metrics.go%22%0A?project=google.com:skia-corp