# promk/prometheus/alerts_perf_cockroachdb.yml - buildbot - Git at Google

 # See https://www.cockroachlabs.com/docs/v20.1/monitor-cockroachdb-with-prometheus.html for details on CockroachDB monitoring.
 #

 groups:
   - name: alerts_perf_cockroachdb.yml
     rules:
       # Fires when `up` has stayed at 0 for a perf-cockroachdb target for 5 minutes.
       - alert: InstanceDown
         expr: 'up{app="perf-cockroachdb"} == 0'
         for: 5m
         annotations:
           summary: 'Instance {{ $labels.instance }} down'
           description: >-
             {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
             down for more than 5 minutes.
       # Same condition as InstanceDown (`up` == 0), but held for 15 minutes.
       - alert: InstanceDead
         expr: 'up{app="perf-cockroachdb"} == 0'
         for: 15m
         annotations:
           summary: 'Instance {{ $labels.instance }} dead'
           description: >-
             {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
             down for more than 15 minutes.
       # Alert on instance restarts: between 1 and 5 resets of sys_uptime within
       # a 10-minute window.
       # The upper bound is inclusive so that exactly 5 resets is still reported;
       # InstanceFlapping only fires above 5, and a strict `< 5` here would leave
       # that value covered by neither alert.
       - alert: InstanceRestart
         expr: >-
           resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 0
           and resets(sys_uptime{app="perf-cockroachdb"}[10m]) <= 5
         annotations:
           description:
             '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
             {{ $value }} time(s) in 10m'
           summary: Instance {{ $labels.instance }} restarted
       # Flapping: sys_uptime reset more than 5 times inside a 10-minute window.
       - alert: InstanceFlapping
         expr: 'resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 5'
         annotations:
           summary: 'Instance {{ $labels.instance }} flapping'
           description: >-
             {{ $labels.instance }} for cluster {{ $labels.cluster }}
             restarted {{ $value }} time(s) in 10m
       # Version mismatch within a cluster.
       # The 30-minute `for` is deliberately loose so that rolling upgrades do not
       # trip the alert; large clusters may need a different value.
       - alert: VersionMismatch
         expr: >-
           count by(cluster) (count_values by(tag, cluster) ("version",
           build_timestamp{app="perf-cockroachdb"})) > 1
         for: 30m
         annotations:
           summary: 'Binary version mismatch on {{ $labels.cluster }}'
           description: >-
             Cluster {{ $labels.cluster }} running {{ $value }} different
             versions
       # Available capacity alerts.
       # Per store: fires when capacity_available:ratio is below 0.15.
       - alert: StoreDiskLow
         expr: 'capacity_available:ratio{app="perf-cockroachdb"} < 0.15'
         annotations:
           summary: >-
             Store {{ $labels.store }} on node {{ $labels.instance }} at
             {{ $value }} available disk fraction
       # Cluster-wide: fires when cluster:capacity_available:ratio is below 0.2.
       - alert: ClusterDiskLow
         expr: 'cluster:capacity_available:ratio{app="perf-cockroachdb"} < 0.2'
         annotations:
           summary: >-
             Cluster {{ $labels.cluster }} at {{ $value }} available disk
             fraction
       # Zero SQL qps: SQL connections are open, yet the 5-minute query rate is 0,
       # and this has held for 10 minutes.
       - alert: ZeroSQLQps
         expr: >-
           sql_conns{app="perf-cockroachdb"} > 0
           and rate(sql_query_count{app="perf-cockroachdb"}[5m]) == 0
         for: 10m
         annotations:
           summary: 'Instance {{ $labels.instance }} has SQL connections but no queries'
       # Unavailable ranges: ranges_unavailable summed per instance and cluster
       # stays above 0 for 10 minutes.
       - alert: UnavailableRanges
         expr: '(sum by(instance, cluster) (ranges_unavailable{app="perf-cockroachdb"})) > 0'
         for: 10m
         labels:
           severity: testing
         annotations:
           summary: >-
             Instance {{ $labels.instance }} has {{ $value }} unavailable
             ranges
       # Leader-not-leaseholder ranges: replicas_leaders_not_leaseholders summed
       # per instance and cluster stays above 0 for 10 minutes.
       - alert: NoLeaseRanges
         expr: >-
           (sum by(instance, cluster)
           (replicas_leaders_not_leaseholders{app="perf-cockroachdb"})) > 0
         for: 10m
         labels:
           severity: testing
         annotations:
           summary: >-
             Instance {{ $labels.instance }} has {{ $value }} ranges without
             leases
       # Open file descriptors close to the limit: sys_fd_open divided by
       # sys_fd_softlimit stays above 0.8 for 10 minutes.
       - alert: HighOpenFDCount
         expr: >-
           sys_fd_open{app="perf-cockroachdb"}
           / sys_fd_softlimit{app="perf-cockroachdb"} > 0.8
         for: 10m
         labels:
           severity: testing
         annotations:
           summary: >-
             Too many open file descriptors on {{ $labels.instance }}:
             {{ $value }} fraction used
 # NOTE(review): the document below repeats the perf-cockroachdb rule group that
 # is defined earlier in this file. It is kept as a separate YAML document so the
 # two top-level `groups` keys do not collide. Confirm whether the duplicate is
 # needed; if not, keep only one copy.
---
 # See https://www.cockroachlabs.com/docs/v20.1/monitor-cockroachdb-with-prometheus.html for details on CockroachDB monitoring.
 #

 groups:
   - name: alerts_perf_cockroachdb.yml
     rules:
       # Alert for any instance that is unreachable for >5 minutes.
       - alert: InstanceDown
         expr: up{app="perf-cockroachdb"} == 0
         for: 5m
         annotations:
           description:
             '{{ $labels.instance }} for cluster {{ $labels.cluster }} has been
             down for more than 5 minutes.'
           summary: Instance {{ $labels.instance }} down
       # Alert for any instance that is unreachable for >15 minutes.
       - alert: InstanceDead
         expr: up{app="perf-cockroachdb"} == 0
         for: 15m
         annotations:
           description:
             '{{ $labels.instance }} for cluster {{ $labels.cluster }} has been
             down for more than 15 minutes.'
           summary: Instance {{ $labels.instance }} dead
       # Alert on instance restarts.
       # The upper bound is inclusive so that exactly 5 resets is still reported;
       # InstanceFlapping only fires above 5.
       - alert: InstanceRestart
         expr: >-
           resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 0
           and resets(sys_uptime{app="perf-cockroachdb"}[10m]) <= 5
         annotations:
           description:
             '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
             {{ $value }} time(s) in 10m'
           summary: Instance {{ $labels.instance }} restarted
       # Alert on flapping instances (frequent restarts).
       - alert: InstanceFlapping
         expr: resets(sys_uptime{app="perf-cockroachdb"}[10m]) > 5
         annotations:
           description:
             '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
             {{ $value }} time(s) in 10m'
           summary: Instance {{ $labels.instance }} flapping
       # Alert on version mismatch.
       # This alert is intentionally loose (30 minutes) to allow for rolling upgrades.
       # This may need to be adjusted for large clusters.
       - alert: VersionMismatch
         expr: count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{app="perf-cockroachdb"})) > 1
         for: 30m
         annotations:
           description: Cluster {{ $labels.cluster }} running {{ $value }} different versions
           summary: Binary version mismatch on {{ $labels.cluster }}
       # Available capacity alerts.
       - alert: StoreDiskLow
         expr: capacity_available:ratio{app="perf-cockroachdb"} < 0.15
         annotations:
           summary:
             Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
             }} available disk fraction
       - alert: ClusterDiskLow
         expr: cluster:capacity_available:ratio{app="perf-cockroachdb"} < 0.2
         annotations:
           summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
       # Zero SQL qps.
       - alert: ZeroSQLQps
         expr: sql_conns{app="perf-cockroachdb"} > 0 and rate(sql_query_count{app="perf-cockroachdb"}[5m]) == 0
         for: 10m
         annotations:
           summary: Instance {{ $labels.instance }} has SQL connections but no queries
       # Unavailable ranges.
       - alert: UnavailableRanges
         expr: (sum by(instance, cluster) (ranges_unavailable{app="perf-cockroachdb"})) > 0
         for: 10m
         labels:
           severity: testing
         annotations:
           summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
       # Leader-not-leaseholder ranges.
       - alert: NoLeaseRanges
         expr: (sum by(instance, cluster) (replicas_leaders_not_leaseholders{app="perf-cockroachdb"})) > 0
         for: 10m
         labels:
           severity: testing
         annotations:
           summary: Instance {{ $labels.instance }} has {{ $value }} ranges without leases
       # Getting close to open file descriptor limit.
       - alert: HighOpenFDCount
         expr: sys_fd_open{app="perf-cockroachdb"} / sys_fd_softlimit{app="perf-cockroachdb"} > 0.8
         for: 10m
         labels:
           severity: testing
         annotations:
           summary:
             'Too many open file descriptors on {{ $labels.instance }}: {{ $value
             }} fraction used'