# blob: 6c90cb2106135759db9ce81cfd1d762a1ce817e7 [file] [log] [blame]
# This file defines alerts to be triggered by the server.
#
# AlertServer should tolerate no errors.
[[rule]]
# Watches the ERROR-level entries of the "logs" measurement for the
# alertserver process on host skia-alerts and emails alerts@skia.org.
name = "AlertServer Errors"
message = "The Error rate for the alertserver is too high, please check the logs."
database = "skmetrics"
# Returns only the newest point (order by time desc limit 1) of the
# 10-minute derivative, restricted to the last 10 minutes of data.
query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='alertserver' AND level='ERROR' AND host='skia-alerts' order by time desc limit 1"
category = "infra"
# Any positive derivative trips the rule.
# NOTE(review): assumes `x` is bound to the value returned by `query` --
# confirm against the rule evaluator.
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
# NOTE(review): presumably the alert clears itself once the condition stops
# holding and re-notifies every 24h while active -- confirm.
auto-dismiss = true
nag = "24h"
#
# SkiaGold
#
[[rule]]
# Emails alerts@skia.org when the untriaged count reported by the
# skiacorrectness app on skia-gold-prod is above zero.
name = "Gold Alert (GM)"
message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage."
database = "skmetrics"
# Newest 'untriaged' point from "gold.status.by-corpus" within the last 10
# minutes.
# NOTE(review): the name and message refer to GM, but the query does not
# filter on a corpus -- confirm whether a corpus restriction is intended.
query = "select value from \"gold.status.by-corpus\" WHERE time > now() - 10m AND type='untriaged' AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
# A single untriaged item is enough to fire.
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
# NOTE(review): presumably auto-clears when the count returns to zero and
# re-notifies every 24h otherwise -- confirm.
auto-dismiss = true
nag = "24h"
[[rule]]
# Emails alerts@skia.org when Gold reports one or more expired ignore rules,
# pointing the reader at https://gold.skia.org/ignores.
name = "Expired Ignores (Gold)"
message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend."
database = "skmetrics"
# Newest point of "gold.num-expired-ignore-rules" from the last 10 minutes,
# for app skiacorrectness on host skia-gold-prod.
query = "select value from \"gold.num-expired-ignore-rules\" WHERE time > now() - 10m AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1"
category = "Gold"
# Fires as soon as the reported number is non-zero.
conditions = ["x > 0"]
actions = ["Email(alerts@skia.org)"]
# NOTE(review): presumably auto-clears once no expired rules remain and
# re-notifies every 24h otherwise -- confirm.
auto-dismiss = true
nag = "24h"
[[rule]]
# Guards the monitor behind the expired-ignore-rules metric: fires when its
# "liveness" series indicates the monitoring itself has stopped succeeding.
name = "Ignore Monitoring Failure (Gold)"
message = "At least two rounds of monitoring for expired ignore rules have failed back to back."
database = "skmetrics"
# Mean of the liveness value over the last 10 minutes for the
# 'gold.expired-ignore-rules-monitoring' series on skia-gold-prod.
query = "select mean(value) from \"liveness\" where time > now() - 10m AND \"name\"='gold.expired-ignore-rules-monitoring' AND app='skiacorrectness' AND host='skia-gold-prod'"
category = "infra"
# NOTE(review): the unit of the liveness value is not visible here;
# presumably seconds since the last successful round, with 200 standing for
# two missed rounds as the message states -- confirm.
conditions = ["x >= 200"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably auto-clears on recovery and re-notifies hourly
# while active -- confirm.
auto-dismiss = true
nag = "1h"
[[rule]]
# Watches the ERROR-level entries of the "logs" measurement for the
# skiacorrectness process on skia-gold-prod.
name = "Gold Error Rate"
message = "The error rate for Gold is too high."
database = "skmetrics"
# Returns only the newest point of the 10-minute derivative, restricted to
# the last 10 minutes of data.
query = "select derivative(value, 10m) from \"logs\" WHERE time > now() - 10m AND \"name\"='skiacorrectness' AND level='ERROR' AND host='skia-gold-prod' order by time desc limit 1"
category = "infra"
# Fires once the derivative reaches 1 or more.
conditions = ["x >= 1"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): neighbouring rules set auto-dismiss = true; confirm that
# keeping this alert open until someone dismisses it is intentional.
auto-dismiss = false
nag = "1h"
#
# Skia Alerts
#
[[rule]]
# Latency check for the alerts JSON endpoint, based on the 'skiaalerts_json'
# probe recorded in the "prober" measurement.
name = "Skia Alerts Latency (JSON endpoint)"
message = "The JSON endpoint at https://alerts.skia.org/json/alerts/ took more than 200ms to respond."
database = "skmetrics"
# Averages the probe latency over the last 10 minutes, so the rule reacts
# to sustained slowness rather than to one slow response.
query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='skiaalerts_json'"
category = "infra"
# The message states a 200ms limit, so the division by 1000000 implies the
# raw probe value is recorded in nanoseconds.
conditions = ["x / 1000000 > 200"]
# Notifies an individual address in addition to the infra-alerts list.
actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"]
# NOTE(review): presumably auto-clears on recovery and re-notifies hourly
# while active -- confirm.
auto-dismiss = true
nag = "1h"
#
# Skolo
#
[[rule]]
# Signals that the hot spare has taken over from skia-rpi-master, using the
# consecutive-failure count reported by the 'hotspare' app.
name = "skia-rpi-master-spare has been active for more than 10 minutes"
message = "skia-rpi-master-spare has been active for more than 10 minutes. Something is probably wrong with skia-rpi-master. go/skolo-maintenance"
database = "skmetrics"
# Highest consecutive-failure count seen in the last 10 minutes.
query = "SELECT max(value) FROM \"skolo.hotspare.consecutive_failures\" WHERE time > now() - 10m AND app='hotspare'"
category = "infra"
# NOTE(review): 600 matches the "10 minutes" in the name only if the metric
# advances once per second -- confirm the hotspare check interval.
conditions = ["x > 600"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably auto-clears on recovery and re-notifies hourly
# while active -- confirm.
auto-dismiss = true
nag = "1h"
[[rule]]
# Detects a missing or undersized rpi image backup by looking at the
# backup size reported by the 'file-backup' app.
name = "The rpi image backup has not been backed up in some time"
message = "The rpi image backup has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
# Largest reported backup size within the last 25 hours; the window matches
# the "at least 25 hours" in the message.
query = "SELECT max(value) FROM \"skolo.rpi-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
# Threshold is 100*1024*1024 = 104857600.
# NOTE(review): that is 100 MiB if the metric is reported in bytes -- confirm.
conditions = ["x < 100*1024*1024"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably auto-clears after a good backup and re-notifies
# hourly while active -- confirm.
auto-dismiss = true
nag = "1h"
[[rule]]
# Detects a missing or undersized router config backup by looking at the
# backup size reported by the 'file-backup' app.
name = "The router config has not been backed up in some time"
message = "The router config has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance"
database = "skmetrics"
# Largest reported backup size within the last 25 hours; the window matches
# the "at least 25 hours" in the message.
query = "SELECT max(value) FROM \"skolo.router-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'"
category = "infra"
# NOTE(review): 1024 is 1 KiB if the metric is reported in bytes -- confirm.
conditions = ["x < 1024"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably auto-clears after a good backup and re-notifies
# hourly while active -- confirm.
auto-dismiss = true
nag = "1h"
[[rule]]
# Per-app, per-host uptime check for every host whose name matches
# /skia-rpi/. The %(app)s and %(host)s placeholders in name and message
# correspond to the tags the query groups by.
# NOTE(review): presumably the alert server substitutes them for each
# returned series -- confirm.
name = "%(app)s on %(host)s is down or crashing a lot."
message = "The skolo app %(app)s on %(host)s is down or is crashing a lot. The average uptime for the last 20 minutes is below 10 minutes. go/skolo-maintenance"
database = "skmetrics"
# Mean liveness value over the last 20 minutes, one result per (app, host)
# pair because of the group by clause.
query = "select mean(value) from liveness where time > now() - 20m AND host =~ /skia-rpi/ group by app, host"
category = "infra"
# 10 * 60 = 600; the message calls this "10 minutes", which implies the
# value is measured in seconds.
conditions = ["x < 10 * 60"]
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably auto-clears on recovery and re-notifies hourly
# while active -- confirm.
auto-dismiss = true
nag = "1h"