# blob: db0524e96d22a0e504b663ec7d08a3552d94a11a [file] [log] [blame]
# See https://www.cockroachlabs.com/docs/v20.1/monitor-cockroachdb-with-prometheus.html for details on CockroachDB monitoring.
#
groups:
- name: alerts_perf_cockroachdb.yml
rules:
# InstanceDown: the `up` series of a perf-cockroachdb target has read 0 for
# 5 minutes straight, i.e. the instance has been unreachable for that long.
- alert: InstanceDown
  expr: 'up{app="perf-cockroachdb"} == 0'
  for: 5m
  annotations:
    summary: 'Instance {{ $labels.instance }} down'
    # Folded scalar: the line break below is rendered as a single space.
    description: >-
      {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
      down for more than 5 minutes.
# InstanceDead: the same `up == 0` condition as a short outage, but it must
# hold for 15 minutes before this alert fires.
- alert: InstanceDead
  expr: 'up{app="perf-cockroachdb"} == 0'
  for: 15m
  annotations:
    summary: 'Instance {{ $labels.instance }} dead'
    # Folded scalar: the line break below is rendered as a single space.
    description: >-
      {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
      down for more than 15 minutes.
# InstanceRestart: sys_uptime was reset between 1 and 4 times over the
# trailing 10 minutes. The rule carries no `for:` clause.
- alert: InstanceRestart
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 0
    and resets(sys_uptime{app="perf-cockroachdb"}[10m]) < 5
  annotations:
    summary: 'Instance {{ $labels.instance }} restarted'
    # {{ $value }} is the reset count produced by the expression above.
    description: >-
      {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
      {{ $value }} time(s) in 10m
# InstanceFlapping: sys_uptime was reset 5 or more times within the trailing
# 10 minutes (frequent restarts).
# The bound is inclusive (>= 5) so that exactly 5 restarts is reported as
# flapping. A strict `> 5`, together with InstanceRestart's `< 5` upper bound,
# would leave that count matching no alert at all.
- alert: InstanceFlapping
  expr: resets(sys_uptime{app="perf-cockroachdb"}[10m]) >= 5
  annotations:
    # Folded scalar: the line break below is rendered as a single space.
    # {{ $value }} is the reset count produced by the expression above.
    description: >-
      {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
      {{ $value }} time(s) in 10m
    summary: 'Instance {{ $labels.instance }} flapping'
# VersionMismatch: a cluster reports more than one distinct
# (tag, build_timestamp value) combination.
# count_values buckets the samples by value (exposed as the "version" label)
# per tag and cluster; the outer count tallies those buckets per cluster.
# The 30-minute hold is intentionally loose to allow for rolling upgrades and
# may need to be adjusted for large clusters.
- alert: VersionMismatch
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    count by(cluster) (count_values by(tag, cluster)
    ("version", build_timestamp{app="perf-cockroachdb"})) > 1
  for: 30m
  annotations:
    summary: 'Binary version mismatch on {{ $labels.cluster }}'
    description: >-
      Cluster {{ $labels.cluster }} running {{ $value }} different versions
# Available capacity alerts.
# StoreDiskLow: a store's capacity_available:ratio is below 0.15.
# NOTE(review): capacity_available:ratio looks like a recorded series that is
# defined outside this section - confirm it exists with the app label.
- alert: StoreDiskLow
  expr: 'capacity_available:ratio{app="perf-cockroachdb"} < 0.15'
  annotations:
    # Folded scalar: the line break below is rendered as a single space.
    summary: >-
      Store {{ $labels.store }} on node {{ $labels.instance }} at
      {{ $value }} available disk fraction
# ClusterDiskLow: the cluster-wide cluster:capacity_available:ratio is
# below 0.2.
# NOTE(review): cluster:capacity_available:ratio looks like a recorded series
# that is defined outside this section - confirm it exists with the app label.
- alert: ClusterDiskLow
  expr: 'cluster:capacity_available:ratio{app="perf-cockroachdb"} < 0.2'
  annotations:
    summary: >-
      Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
# ZeroSQLQps: an instance has open SQL connections (sql_conns > 0) while the
# 5-minute rate of sql_query_count is exactly 0, sustained for 10 minutes.
- alert: ZeroSQLQps
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    sql_conns{app="perf-cockroachdb"} > 0
    and rate(sql_query_count{app="perf-cockroachdb"}[5m]) == 0
  for: 10m
  annotations:
    summary: >-
      Instance {{ $labels.instance }} has SQL connections but no queries
# UnavailableRanges: the per-instance, per-cluster sum of ranges_unavailable
# has stayed above 0 for 10 minutes.
- alert: UnavailableRanges
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    (sum by(instance, cluster)
    (ranges_unavailable{app="perf-cockroachdb"})) > 0
  for: 10m
  labels:
    severity: testing
  annotations:
    # {{ $value }} is the summed ranges_unavailable for the instance.
    summary: >-
      Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
# NoLeaseRanges: the per-instance, per-cluster sum of
# replicas_leaders_not_leaseholders (leader-not-leaseholder ranges) has stayed
# above 0 for 10 minutes.
- alert: NoLeaseRanges
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    (sum by(instance, cluster)
    (replicas_leaders_not_leaseholders{app="perf-cockroachdb"})) > 0
  for: 10m
  labels:
    severity: testing
  annotations:
    # {{ $value }} is the summed metric for the instance.
    summary: >-
      Instance {{ $labels.instance }} has {{ $value }} ranges without leases
# HighOpenFDCount: open file descriptors (sys_fd_open) have exceeded 80% of
# the soft limit (sys_fd_softlimit) for 10 minutes.
- alert: HighOpenFDCount
  # Folded scalar: the two lines join with one space into a single expression.
  expr: >-
    sys_fd_open{app="perf-cockroachdb"}
    / sys_fd_softlimit{app="perf-cockroachdb"} > 0.8
  for: 10m
  labels:
    severity: testing
  annotations:
    # Block scalar avoids quoting the ": " inside the message.
    summary: >-
      Too many open file descriptors on {{ $labels.instance }}:
      {{ $value }} fraction used