| # Alerts for things in the skia-corp cluster only. |
| # A single rule group named "general"; the alert entries that follow are listed under its rules key. |
| groups: |
| - name: general |
| rules: |
| # Fires once autoroll_last_roll_result for the skia-internal-autoroll roller has |
| # equalled 0 for 10 minutes; the description reports this as a failed DEPS roll. |
| - alert: InternalAutoRoll |
| expr: autoroll_last_roll_result{roller="skia-internal-autoroll"} == 0 |
| for: 10m |
| labels: |
| category: infra |
| severity: warning |
| owner: borenet@google.com |
| annotations: |
| description: 'The last DEPS roll into internal_test repo failed. https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#autoroll_failed' |
| |
| # Fires when liveness_last_autoroll_landed_s for the skia-internal-autoroll roller, |
| # divided by 60 twice (presumably seconds to hours, given the 24h wording in the |
| # description), is greater than 24. No for: clause is set on this rule. |
| - alert: InternalAutoRoll24H |
| expr: liveness_last_autoroll_landed_s{roller="skia-internal-autoroll"}/60/60 > 24 |
| labels: |
| category: infra |
| severity: warning |
| owner: borenet@google.com |
| annotations: |
| description: 'The last-landed roll into internal_test was over 24h ago. https://skia.googlesource.com/buildbot/%2B/main/autoroll/PROD.md#no_rolls_24h' |
| |
| # Fires when ERROR-level num_log_lines for any app matching the regex trybot-updater.* |
| # grows faster than 0.001 lines per second (about 3.6 per hour), measured with rate() |
| # over a 1h window. |
| - alert: TrybotUpdaterErrorRate |
| expr: rate(num_log_lines{level="ERROR",app=~"trybot-updater.*"}[1h]) > 0.001 |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| # abbr and the log-viewer link in the description are filled in from the alert's own labels. |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate on {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Fires when ERROR-level num_log_lines for the skcq-be app grows faster than 0.001 |
| # lines per second (about 3.6 per hour), measured with rate() over a 1h window. |
| # The app name is a plain literal, so it is matched with = rather than a regex; |
| # regex matchers are fully anchored, so this selects exactly the same series. |
| - alert: SkCQBackendErrorRate |
| expr: rate(num_log_lines{level="ERROR",app="skcq-be"}[1h]) > 0.001 |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| # abbr and the log-viewer link in the description are filled in from the alert's own labels. |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate on {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| # Service account keys checker |
| |
| # Fires when ERROR-level num_log_lines for the sa-keys-checker app grows faster than |
| # 0.001 lines per second (about 3.6 per hour), measured with rate() over a 1h window. |
| - alert: SAKeysCheckerErrorRate |
| expr: rate(num_log_lines{level="ERROR",app="sa-keys-checker"}[1h]) > 0.001 |
| labels: |
| category: infra |
| severity: critical |
| owner: rmistry@google.com |
| # abbr and the log-viewer link in the description are filled in from the alert's own labels. |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'The error rate on {{ $labels.app }} is too high. |
| https://console.cloud.google.com/logs/viewer?project={{ $labels.project }}&resource=k8s_container%2Fcluster_name%2F{{ $labels.cluster }}%2Fnamespace_name%2Fdefault%2Fcontainer_name%2F{{ $labels.app }}' |
| |
| |
| # Fires when sa_key_expiration_s reported by sa-keys-checker is above 0 and below |
| # 30*24*60*60 (30 days if the metric is in seconds, matching the description). |
| # NOTE(review): no owner label is set, unlike SAKeysCheckerErrorRate for the same |
| # app - confirm that is intentional. |
| - alert: SAKeyExpiringSoon |
| expr: 0 < sa_key_expiration_s{app="sa-keys-checker"} < 30*24*60*60 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.sa }} - {{ $labels.key }}' |
| description: 'The key {{ $labels.key }} of {{ $labels.sa }} in project {{ $labels.project }} is going to expire in less than 30 days. |
| Production Manual: https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expiring_soon' |
| |
| # Fires when sa_key_expiration_s reported by sa-keys-checker is below 0, which the |
| # description reports as an already-expired key. |
| # NOTE(review): a value of exactly 0 matches neither this rule nor SAKeyExpiringSoon |
| # (whose expression starts with 0 <) - confirm that gap is intended. |
| # NOTE(review): no owner label is set on this rule - confirm that is intentional. |
| - alert: SAKeyExpired |
| expr: sa_key_expiration_s{app="sa-keys-checker"} < 0 |
| labels: |
| category: infra |
| severity: warning |
| annotations: |
| abbr: '{{ $labels.sa }} - {{ $labels.key }}' |
| description: 'The key {{ $labels.key }} of {{ $labels.sa }} in project {{ $labels.project }} has expired. |
| Production Manual: https://skia.googlesource.com/buildbot/%2B/main/sa-keys-checker/PROD.md#sa_key_expired' |
| |
| # Louhi continuous deployment pipeline. |
| # Fires when cd_image_build_latency_s is greater than 2*60*60 (two hours, assuming the |
| # metric is in seconds - TODO confirm). No label selector is applied to the metric. |
| - alert: CDImageBuildLatency |
| expr: cd_image_build_latency_s > 2*60*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| # The image and commit labels of the firing series are substituted into the annotations. |
| annotations: |
| abbr: '{{ $labels.image }}' |
| description: 'No {{ $labels.image }} Docker image has been built for commit https://skia.googlesource.com/buildbot.git/+/{{ $labels.commit }} |
| Check the associated flow in https://louhi.dev/?projectId=6316342352543744#/home' |
| |
| # Fires when cd_k8s_config_latency_s is greater than 2*60*60 (two hours, assuming the |
| # metric is in seconds - TODO confirm). No label selector is applied to the metric. |
| - alert: CDK8sConfigLatency |
| expr: cd_k8s_config_latency_s > 2*60*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| # The image and commit labels of the firing series are substituted into the annotations. |
| annotations: |
| abbr: '{{ $labels.image }}' |
| description: 'No commit has landed to update the {{ $labels.image }} Docker image in the k8s-config repo for commit https://skia.googlesource.com/buildbot.git/+/{{ $labels.commit }} |
| Check the associated flow in https://louhi.dev/?projectId=6316342352543744#/home' |
| |
| # Fires when liveness_last_successful_cd_pipeline_metrics_s is greater than 30*60 |
| # (30 minutes, assuming the metric is in seconds - TODO confirm). |
| - alert: LastCDMetricsSuccess |
| expr: liveness_last_successful_cd_pipeline_metrics_s > 30*60 |
| labels: |
| category: infra |
| severity: critical |
| owner: borenet@google.com |
| # abbr and the description text use the app label; the log link is a fixed query for the |
| # datahopper-internal container and cd_metrics.go rather than being built from labels. |
| annotations: |
| abbr: '{{ $labels.app }}' |
| description: 'CD metrics ingestion on {{ $labels.app }} has not completed successfully. |
| https://pantheon.corp.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.container_name%3D%22datahopper-internal%22%0AsourceLocation.file%3D%22cd_metrics.go%22%0A?project=google.com:skia-corp' |