# Alerts for Thanos. For safety, these alerts are fired from both Thanos and
# the skia-public Prometheus instance.
#
groups:
  - name: thanos
    rules:
      # Fires when the rule-evaluation failure counter has been increasing
      # (positive 5m rate) continuously for 5 minutes.
      - alert: PrometheusRuleEvaluationFailures
        expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: 'Thanos/Prometheus is failing to evaluate rules, check {{ $labels.kubernetes_pod_name }} pod logs'

      # Fires when the dropped-alerts counter of the alert queue has been
      # increasing (positive 5m rate) continuously for 5 minutes.
      - alert: ThanosRuleIsDroppingAlerts
        expr: rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: 'Thanos Rule is dropping alerts, check {{ $labels.kubernetes_pod_name }} pod logs'

      # Fires when handled gRPC calls finishing with one of the listed error
      # codes have a positive 5m rate continuously for 5 minutes.
      # NOTE(review): the selector has no job/pod matcher, so any series of
      # grpc_server_handled_total with these codes triggers it -- confirm that
      # only Thanos Rule exposes this metric to the evaluating instance.
      - alert: ThanosRuleGrpcErrorRate
        expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable"}[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          # The description lists every code matched by the expr above.
          description: 'Thanos Rule is returning Unknown/ResourceExhausted/Internal/Unavailable errors, check {{ $labels.kubernetes_pod_name }} pod logs.'