blob: f4786f1456b3334964c99f38442c7b35f21d9ca3 [file] [log] [blame]
# Alerts for Thanos. For safety, they are fired from both Thanos and the
# skia-public Prometheus instance.
#
groups:
  # Single rule group holding the Thanos health alerts. Every rule uses a
  # 5-minute rate window and must stay above zero for a further 5 minutes
  # (`for: 5m`) before it fires.
  - name: thanos
    rules:
      # Rule evaluation is failing: the failure counter has a non-zero
      # per-second rate over the last 5 minutes.
      - alert: PrometheusRuleEvaluationFailures
        expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: >-
            Thanos/Prometheus is failing to evaluate rules, check
            {{ $labels.kubernetes_pod_name }} pod logs

      # Alerts are being dropped from the Thanos alert queue.
      - alert: ThanosRuleIsDroppingAlerts
        expr: rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: >-
            Thanos Rule is dropping alerts, check
            {{ $labels.kubernetes_pod_name }} pod logs

      # gRPC requests are completing with a server-side error code. The
      # selector matches four codes, so the description reports the code
      # that actually fired via the grpc_code label instead of naming a
      # fixed subset.
      # NOTE(review): the selector has no job/component matcher, so any
      # scraped gRPC server exporting grpc_server_handled_total can trigger
      # this alert, not only Thanos Rule -- confirm that is intended.
      - alert: ThanosRuleGrpcErrorRate
        expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable"}[5m]) > 0
        for: 5m
        labels:
          category: infra
          severity: critical
        annotations:
          abbr: '{{ $labels.instance }}'
          description: >-
            Thanos Rule is returning gRPC {{ $labels.grpc_code }} errors,
            check {{ $labels.kubernetes_pod_name }} pod logs.