# This file defines the alert rules evaluated by the alert server.
#
# AlertServer itself should produce no errors; any new ERROR log entry triggers this alert.
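#
# Every rule below follows the same schema, sketched here as a commented-out
# template. The field descriptions are inferred from the rules in this file
# rather than from the alert server's documentation, and the values are
# placeholders only:
#
# [[rule]]
# name = "Short alert name (may contain %(tag)s placeholders for group-by tags)"
# message = "What is wrong and where to look."
# database = "skmetrics"                # InfluxDB database the query runs against.
# query = "select value from ..."       # The query result is bound to x in the conditions.
# category = "infra"                    # Category label, e.g. "infra" or "Gold".
# conditions = ["x > 0"]                # Expressions that trigger the alert.
# actions = ["Email(alerts@skia.org)"]  # Taken when the alert triggers.
# auto-dismiss = true                   # Whether the alert clears itself once conditions stop matching.
# nag = "24h"                           # How often to re-notify while the alert is active.
#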
[[rule]]
name = "AlertServer Errors"
message = "The Error rate for the alertserver is too high, please check the logs."
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='alertserver' AND level='ERROR' AND host='skia-alerts' order by time desc limit 1"
category = "infra"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
#
# Skia Gold
#
[[rule]]
name = "Gold Alert (GM)"
message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage."
database = "skmetrics"
query = "select value from \"gold.status.by-corpus\" WHERE time > now() - 10m AND type='untriaged' AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
[[rule]]
name = "Expired Ignores (Gold)"
message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend."
database = "skmetrics"
query = "select value from \"gold.num-expired-ignore-rules\" WHERE time > now() - 10m AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
[[rule]]
name = "Ignore Monitoring Failure (Gold)"
message = "At least two rounds of monitoring for expired ignore rules have failed back to back."
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 10m AND \"name\"='gold.expired-ignore-rules-monitoring' AND app='skiacorrectness' AND host='skia-gold-prod'"
category = "infra"
conditions = ["x >= 200"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Gold Error Rate"
message = "The error rate for Gold is too high."
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" WHERE time > now() - 10m AND \"name\"='skiacorrectness' AND level='ERROR' AND host='skia-gold-prod' order by time desc limit 1"
category = "infra"
conditions = ["x >= 1"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = false
nag = "1h"
#
# Skia Alerts
#
[[rule]]
name = "Skia Alerts Latency (JSON endpoint)"
message = "The JSON endpoint at https://alerts.skia.org/json/alerts/ took more than 200ms to respond."
database = "skmetrics"
query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='skiaalerts_json'"
category = "infra"
conditions = ["x / 1000000 > 200"]
actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
auto-dismiss = true
nag = "1h"
#
# Skolo
#
[[rule]]
name = "skia-rpi-master-spare has been active for more than 10 minutes"
message = "skia-rpi-master-spare has been active for more than 10 minutes. Something is probably wrong with skia-rpi-master. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.hotspare.consecutive_failures\" WHERE time > now() - 10m AND app='hotspare'"
category = "infra"
conditions = ["x > 600"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "The rpi image backup has not been backed up in some time"
message = "The rpi image backup has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.rpi-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
conditions = ["x < 100*1024*1024"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "The router config has not been backed up in some time"
message = "The router config has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.router-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
conditions = ["x < 1024"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "%(app)s on %(host)s is down or crashing a lot."
message = "The skolo app %(app)s on %(host)s is down or is crashing a lot. The average uptime for the last 20 minutes is below 10 minutes. go/skolo-maintenance"
database = "skmetrics"
query = "select mean(value) from liveness where time > now() - 20m AND host =~ /skia-rpi/ group by app, host"
category = "infra"
conditions = ["x < 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
#
# Task Scheduler
#
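#
# The liveness values used throughout this section appear to be seconds since
# the last success, so the thresholds are written as arithmetic on seconds:
# "x > 10 * 60" for 10 minutes, "x > 25 * 60 * 60" for 25 hours, and so on.
#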
[[rule]]
name = "Task Scheduler Failing (%(host)s)"
message = "The task scheduler has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#scheduling_failed"
database = "skmetrics"
query = "select mean(value) from liveness where time > now() - 5m AND app='task_scheduler' and \"name\"='last-successful-task-scheduling' group by app, host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Task Scheduler HTTP Latency"
message = "https://task-scheduler.skia.org took more than 300ms to respond. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#http_latency"
database = "skmetrics"
query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='task_scheduler'"
category = "infra"
conditions = ["x / 1000000 > 300"]
actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Task Scheduler Error Rate (%(host)s)"
message = "The error rate for task_scheduler on %(host)s is too high. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#error_rate"
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='task_scheduler' AND level='ERROR' group by host order by time desc limit 1"
category = "infra"
conditions = ["x >= 5"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = false
nag = "1h"
[[rule]]
name = "Task Scheduler DB backup (%(host)s)"
message = "The last Task Scheduler DB backup was more than 25 hours ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#old_db_backup"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND \"name\" = 'last-db-backup' group by host"
category = "infra"
conditions = ["x > 25 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler extra DB backups (%(host)s)"
message = "There are too many recent Task Scheduler DB backups. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#too_many_recent_db_backups"
database = "skmetrics"
query = "select mean(value) from \"recent-db-backup-count\" where time > now() - 1h AND app='task_scheduler' group by host"
category = "infra"
conditions = ["x > 9"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler DB backup trigger (%(host)s)"
message = "The last time we checked for a Task Scheduler DB backup trigger file was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_backup_trigger_liveness"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'db-backup-maybe-backup-db' group by host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler incremental backup (%(host)s)"
message = "The last time a Task Scheduler incremental backup succeeded was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_liveness"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup' group by host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler incremental backup reset (%(host)s)"
message = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_reset"
database = "skmetrics"
query = "select mean(value) from \"counter\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup-reset' group by host"
category = "infra"
conditions = ["x > 0"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler DB excess free pages (%(host)s)"
message = "There are a large number of free pages in the Task Scheduler DB. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_too_many_free_pages"
database = "skmetrics"
query = "select mean(value) from \"db\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND metric='FreePageCount' group by host"
category = "infra"
conditions = ["x > 150"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler Nightly Trigger (%(host)s)"
message = "The Task Scheduler's nightly trigger has not run in over 25 hours. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_nightly"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='nightly' group by host"
category = "infra"
conditions = ["x > 25 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler Weekly Trigger (%(host)s)"
message = "The Task Scheduler's weekly trigger has not run in over 8 days. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_weekly"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='weekly' group by host"
category = "infra"
conditions = ["x > 8 * 24 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"