# promk/prometheus/alerts_thanos.yml - buildbot - Git at Google

 # Alerts for Thanos, which we fire from both thanos and the skia-public
 # prometheus instance for safety.
 #
 # Single rule group named "thanos"; each entry under `rules` is one alert.
 groups:
 - name: thanos
   rules:
   # Fires when the 5m rate of prometheus_rule_evaluation_failures_total has
   # stayed above zero for 5m.
   - alert: PrometheusRuleEvaluationFailures
     expr: 'rate(prometheus_rule_evaluation_failures_total[5m]) > 0'
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       # Short identifier for the alert: the instance label of the series.
       abbr: '{{ $labels.instance }}'
       description: >-
         Thanos/Prometheus is failing to evaluate rules,
         check {{ $labels.kubernetes_pod_name }} pod logs
   # Fires when the 5m rate of thanos_alert_queue_alerts_dropped_total has
   # stayed above zero for 5m.
   - alert: ThanosRuleIsDroppingAlerts
     expr: 'rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0'
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       # Short identifier for the alert: the instance label of the series.
       abbr: '{{ $labels.instance }}'
       description: >-
         Thanos Rule is dropping alerts,
         check {{ $labels.kubernetes_pod_name }} pod logs
   # Fires when gRPC calls finishing with one of the listed error codes
   # (Unknown, ResourceExhausted, Internal, Unavailable) have had a non-zero
   # 5m rate for 5m.
   # NOTE(review): the selector only matches on grpc_code; it has no
   # job/component matcher, so any target exporting grpc_server_handled_total
   # can trigger this alert. Confirm that is intended given the alert name.
   - alert: ThanosRuleGrpcErrorRate
     expr: 'rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable"}[5m]) > 0'
     for: 5m
     labels:
       category: infra
       severity: critical
     annotations:
       # Short identifier for the alert: the instance label of the series.
       abbr: '{{ $labels.instance }}'
       # Lists every code matched by the expr above so the message agrees
       # with what actually triggers the alert.
       description: >-
         Thanos Rule is returning Unknown/ResourceExhausted/Internal/Unavailable
         gRPC errors, check {{ $labels.kubernetes_pod_name }} pod logs.
	# Alerts for Thanos, which we fire from both thanos and the skia-public
	# prometheus instance for safety.
	#
	groups:
	- name: thanos
	rules:
	- alert: PrometheusRuleEvaluationFailures
	expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.instance }}'
	description: 'Thanos/Prometheus is failing to evaluate rules, check {{ $labels.kubernetes_pod_name }} pod logs'
	- alert: ThanosRuleIsDroppingAlerts
	expr: rate(thanos_alert_queue_alerts_dropped_total[5m]) > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.instance }}'
	description: 'Thanos Rule is dropping alerts, check {{ $labels.kubernetes_pod_name }} pod logs'
	- alert: ThanosRuleGrpcErrorRate
	expr: rate(grpc_server_handled_total{grpc_code=~"Unknown\|ResourceExhausted\|Internal\|Unavailable"}[5m]) > 0
	for: 5m
	labels:
	category: infra
	severity: critical
	annotations:
	abbr: '{{ $labels.instance }}'
	description: Thanos Rule is returning Internal/Unavailable errors, check {{ $labels.kubernetes_pod_name }} pod logs.