# This file defines the alert rules evaluated by the alert server.
#
# AlertServer itself should produce no errors; any new ERROR log entry triggers this alert.
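#
# Every rule below follows the same schema, sketched here as a commented-out
# template. The field descriptions are inferred from the rules in this file
# rather than from the alert server's documentation, and the values are
# placeholders only:
#
# [[rule]]
# name = "Short alert name (may contain %(tag)s placeholders for group-by tags)"
# message = "What is wrong and where to look."
# database = "skmetrics"                # InfluxDB database the query runs against.
# query = "select value from ..."       # The query result is bound to x in the conditions.
# category = "infra"                    # Category label, e.g. "infra" or "Gold".
# conditions = ["x > 0"]                # Expressions that trigger the alert.
# actions = ["Email(alerts@skia.org)"]  # Taken when the alert triggers.
# auto-dismiss = true                   # Whether the alert clears itself once conditions stop matching.
# nag = "24h"                           # How often to re-notify while the alert is active.
#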
[[rule]]
name = "AlertServer Errors"
message = "The Error rate for the alertserver is too high, please check the logs."
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='alertserver' AND level='ERROR' AND host='skia-alerts' order by time desc limit 1"
category = "infra"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
#
# Skia Gold
#
[[rule]]
name = "Gold Alert (GM)"
message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage."
database = "skmetrics"
query = "select value from \"gold.status.by-corpus\" WHERE time > now() - 10m AND type='untriaged' AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
[[rule]]
name = "Expired Ignores (Gold)"
message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend."
database = "skmetrics"
query = "select value from \"gold.num-expired-ignore-rules\" WHERE time > now() - 10m AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
auto-dismiss = true
nag = "24h"
[[rule]]
name = "Ignore Monitoring Failure (Gold)"
message = "At least two rounds of monitoring for expired ignore rules have failed back to back."
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 10m AND \"name\"='gold.expired-ignore-rules-monitoring' AND app='skiacorrectness' AND host='skia-gold-prod'"
category = "infra"
conditions = ["x >= 200"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Gold Error Rate"
message = "The error rate for Gold is too high."
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" WHERE time > now() - 10m AND \"name\"='skiacorrectness' AND level='ERROR' AND host='skia-gold-prod' order by time desc limit 1"
category = "infra"
conditions = ["x >= 1"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = false
nag = "1h"
#
# Skia Alerts
#
[[rule]]
name = "Skia Alerts Latency (JSON endpoint)"
message = "The JSON endpoint at https://alerts.skia.org/json/alerts/ took more than 200ms to respond."
database = "skmetrics"
query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='skiaalerts_json'"
category = "infra"
conditions = ["x / 1000000 > 200"]
actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
auto-dismiss = true
nag = "1h"
#
# Skolo
#
[[rule]]
name = "skia-rpi-master-spare has been active for more than 10 minutes"
message = "skia-rpi-master-spare has been active for more than 10 minutes. Something is probably wrong with skia-rpi-master. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.hotspare.consecutive_failures\" WHERE time > now() - 10m AND app='hotspare'"
category = "infra"
conditions = ["x > 600"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "The rpi image backup has not been backed up in some time"
message = "The rpi image backup has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.rpi-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
conditions = ["x < 100*1024*1024"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "The router config has not been backed up in some time"
message = "The router config has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
query = "SELECT max(value) FROM \"skolo.router-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
conditions = ["x < 1024"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "%(app)s on %(host)s is down or crashing a lot."
message = "The skolo app %(app)s on %(host)s is down or is crashing a lot. The average uptime for the last 20 minutes is below 10 minutes. go/skolo-maintenance"
database = "skmetrics"
query = "select mean(value) from liveness where time > now() - 20m AND host =~ /skia-rpi/ group by app, host"
category = "infra"
conditions = ["x < 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
#
# Task Scheduler
#
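#
# The liveness values used throughout this section appear to be seconds since
# the last success, so the thresholds are written as arithmetic on seconds:
# "x > 10 * 60" for 10 minutes, "x > 25 * 60 * 60" for 25 hours, and so on.
#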
[[rule]]
name = "Task Scheduler Failing (%(host)s)"
message = "The task scheduler has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#scheduling_failed"
database = "skmetrics"
query = "select mean(value) from liveness where time > now() - 5m AND app='task_scheduler' and \"name\"='last-successful-task-scheduling' group by app, host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Task Scheduler HTTP Latency"
message = "https://task-scheduler.skia.org took more than 300ms to respond. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#http_latency"
database = "skmetrics"
query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='task_scheduler'"
category = "infra"
conditions = ["x / 1000000 > 300"]
actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
auto-dismiss = true
nag = "1h"
[[rule]]
name = "Task Scheduler Error Rate (%(host)s)"
message = "The error rate for task_scheduler on %(host)s is too high. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#error_rate"
database = "skmetrics"
query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='task_scheduler' AND level='ERROR' group by host order by time desc limit 1"
category = "infra"
conditions = ["x >= 5"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = false
nag = "1h"
[[rule]]
name = "Task Scheduler DB backup (%(host)s)"
message = "The last Task Scheduler DB backup was more than 25 hours ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#old_db_backup"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND \"name\" = 'last-db-backup' group by host"
category = "infra"
conditions = ["x > 25 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler extra DB backups (%(host)s)"
message = "There are too many recent Task Scheduler DB backups. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#too_many_recent_db_backups"
database = "skmetrics"
query = "select mean(value) from \"recent-db-backup-count\" where time > now() - 1h AND app='task_scheduler' group by host"
category = "infra"
conditions = ["x > 9"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler DB backup trigger (%(host)s)"
message = "The last time we checked for a Task Scheduler DB backup trigger file was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_backup_trigger_liveness"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'db-backup-maybe-backup-db' group by host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler incremental backup (%(host)s)"
message = "The last time a Task Scheduler incremental backup succeeded was more than 10 minutes ago. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_liveness"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup' group by host"
category = "infra"
conditions = ["x > 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler incremental backup reset (%(host)s)"
message = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#incremental_backup_reset"
database = "skmetrics"
query = "select mean(value) from \"counter\" where time > now() - 5m AND \"database\" = 'task_scheduler_db' AND \"name\" = 'incremental-backup-reset' group by host"
category = "infra"
conditions = ["x > 0"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler DB excess free pages (%(host)s)"
message = "There are a large number of free pages in the Task Scheduler DB. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#db_too_many_free_pages"
database = "skmetrics"
query = "select mean(value) from \"db\" where time > now() - 1h AND \"database\" = 'task_scheduler_db' AND metric='FreePageCount' group by host"
category = "infra"
conditions = ["x > 150"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler Nightly Trigger (%(host)s)"
message = "The Task Scheduler's nightly trigger has not run in over 25 hours. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_nightly"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='nightly' group by host"
category = "infra"
conditions = ["x > 25 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"
[[rule]]
name = "Task Scheduler Weekly Trigger (%(host)s)"
message = "The Task Scheduler's weekly trigger has not run in over 8 days. https://skia.googlesource.com/buildbot/+/master/task_scheduler/PROD.md#trigger_weekly"
database = "skmetrics"
query = "select mean(value) from \"liveness\" where time > now() - 1h AND \"name\" = 'task-scheduler-periodic-trigger' AND trigger='weekly' group by host"
category = "infra"
conditions = ["x > 8 * 24 * 60 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "6h"