# See https://www.cockroachlabs.com/docs/v20.1/monitor-cockroachdb-with-prometheus.html for details on CockroachDB monitoring.
#
# Prometheus alerting rules; every expression selects series labelled app="perf-cockroachdb".
---
groups:
  - name: alerts_perf_cockroachdb.yml
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up{app="perf-cockroachdb"} == 0
        for: 5m
        annotations:
          description: >-
            {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
            down for more than 5 minutes.
          summary: 'Instance {{ $labels.instance }} down'

      # Alert for any instance that is unreachable for >15 minutes.
      # Same expression as InstanceDown; only the hold duration differs.
      - alert: InstanceDead
        expr: up{app="perf-cockroachdb"} == 0
        for: 15m
        annotations:
          description: >-
            {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
            down for more than 15 minutes.
          summary: 'Instance {{ $labels.instance }} dead'

      # Alert on instance restarts: 1 to 4 resets of sys_uptime within 10 minutes.
      # Five or more resets are reported by InstanceFlapping instead.
      - alert: InstanceRestart
        expr: >-
          resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 0
          and resets(sys_uptime{app="perf-cockroachdb"}[10m]) < 5
        annotations:
          description: >-
            {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
            {{ $value }} time(s) in 10m
          summary: 'Instance {{ $labels.instance }} restarted'

      # Alert on flapping instances (frequent restarts).
      # The threshold is inclusive (>= 5) so that exactly 5 resets, which
      # InstanceRestart excludes with "< 5", does not go unreported.
      - alert: InstanceFlapping
        expr: resets(sys_uptime{app="perf-cockroachdb"}[10m]) >= 5
        annotations:
          description: >-
            {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
            {{ $value }} time(s) in 10m
          summary: 'Instance {{ $labels.instance }} flapping'

      # Alert on version mismatch.
      # This alert is intentionally loose (30 minutes) to allow for rolling upgrades.
      # This may need to be adjusted for large clusters.
      # The inner count_values groups build_timestamp series by (tag, cluster);
      # the outer count yields the number of those groups per cluster.
      - alert: VersionMismatch
        expr: >-
          count by(cluster) (count_values by(tag, cluster)
          ("version", build_timestamp{app="perf-cockroachdb"})) > 1
        for: 30m
        annotations:
          description: 'Cluster {{ $labels.cluster }} running {{ $value }} different versions'
          summary: 'Binary version mismatch on {{ $labels.cluster }}'

      # Available capacity alerts.
      # NOTE(review): capacity_available:ratio and cluster:capacity_available:ratio
      # look like recording rules defined outside this file - confirm they are loaded.
      - alert: StoreDiskLow
        expr: capacity_available:ratio{app="perf-cockroachdb"} < 0.15
        annotations:
          summary: >-
            Store {{ $labels.store }} on node {{ $labels.instance }} at
            {{ $value }} available disk fraction

      - alert: ClusterDiskLow
        expr: cluster:capacity_available:ratio{app="perf-cockroachdb"} < 0.2
        annotations:
          summary: 'Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction'

      # Zero SQL qps: open SQL connections but a zero 5-minute query rate,
      # sustained for 10 minutes.
      - alert: ZeroSQLQps
        expr: >-
          sql_conns{app="perf-cockroachdb"} > 0
          and rate(sql_query_count{app="perf-cockroachdb"}[5m]) == 0
        for: 10m
        annotations:
          summary: 'Instance {{ $labels.instance }} has SQL connections but no queries'

      # Unavailable ranges, summed per (instance, cluster).
      - alert: UnavailableRanges
        expr: (sum by(instance, cluster) (ranges_unavailable{app="perf-cockroachdb"})) > 0
        for: 10m
        labels:
          severity: testing
        annotations:
          summary: 'Instance {{ $labels.instance }} has {{ $value }} unavailable ranges'

      # Leader-not-leaseholder ranges, summed per (instance, cluster).
      - alert: NoLeaseRanges
        expr: (sum by(instance, cluster) (replicas_leaders_not_leaseholders{app="perf-cockroachdb"})) > 0
        for: 10m
        labels:
          severity: testing
        annotations:
          summary: 'Instance {{ $labels.instance }} has {{ $value }} ranges without leases'

      # Getting close to open file descriptor limit: more than 80% of the
      # soft limit in use for 10 minutes.
      - alert: HighOpenFDCount
        expr: sys_fd_open{app="perf-cockroachdb"} / sys_fd_softlimit{app="perf-cockroachdb"} > 0.8
        for: 10m
        labels:
          severity: testing
        annotations:
          summary: >-
            Too many open file descriptors on {{ $labels.instance }}:
            {{ $value }} fraction used