| # This file defines alerts to be triggered by the server. |
| |
| # |
| # AlertServer should tolerate no errors. |
| |
| [[rule]] |
| # Watches the alertserver's ERROR-level log series on host 'skia-alerts'. |
| # The query takes derivative(value, 10m) of "logs" and keeps only the newest |
| # point from the last 10 minutes. |
| name = "AlertServer Errors" |
| message = "The Error rate for the alertserver is too high, please check the logs." |
| database = "skmetrics" |
| query = "select derivative(value, 10m) from \"logs\" where time > now() - 10m AND \"name\"='alertserver' AND level='ERROR' AND host='skia-alerts' order by time desc limit 1" |
| category = "infra" |
| # Trips on any positive derivative. x is presumably the single value the |
| # query returns -- confirm against the rule evaluator. |
| conditions = ["x > 0"] |
| actions = ["Email(alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "24h" |
| |
| # |
| # SkiaGold |
| # |
| |
| [[rule]] |
| # Reads the newest point (last 10 minutes) of "gold.status.by-corpus" with |
| # type 'untriaged' for app 'skiacorrectness' on host 'skia-gold-prod'. |
| # NOTE(review): the name and message refer to GM, but the query has no |
| # corpus filter -- confirm whether one is intended. |
| name = "Gold Alert (GM)" |
| message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage." |
| database = "skmetrics" |
| query = "select value from \"gold.status.by-corpus\" WHERE time > now() - 10m AND type='untriaged' AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1" |
| category = "Gold" |
| # Trips as soon as the selected value is positive. |
| conditions = ["x > 0"] |
| actions = ["Email(alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "24h" |
| |
| [[rule]] |
| # Reads the newest point (last 10 minutes) of "gold.num-expired-ignore-rules" |
| # for app 'skiacorrectness' on host 'skia-gold-prod'. |
| name = "Expired Ignores (Gold)" |
| message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend." |
| database = "skmetrics" |
| query = "select value from \"gold.num-expired-ignore-rules\" WHERE time > now() - 10m AND app = 'skiacorrectness' AND host = 'skia-gold-prod' order by time desc limit 1" |
| category = "Gold" |
| # Trips as soon as the selected value is positive. |
| conditions = ["x > 0"] |
| actions = ["Email(alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "24h" |
| |
| [[rule]] |
| # Averages the "liveness" series named 'gold.expired-ignore-rules-monitoring' |
| # (app 'skiacorrectness', host 'skia-gold-prod') over the last 10 minutes. |
| # mean(value) aggregates the whole window, so no "order by ... limit 1" is |
| # used here. |
| name = "Ignore Monitoring Failure (Gold)" |
| message = "At least two rounds of monitoring for expired ignore rules have failed back to back." |
| database = "skmetrics" |
| query = "select mean(value) from \"liveness\" where time > now() - 10m AND \"name\"='gold.expired-ignore-rules-monitoring' AND app='skiacorrectness' AND host='skia-gold-prod'" |
| category = "infra" |
| # Trips when the mean reaches 200. The unit of the liveness value is not |
| # visible in this file (presumably seconds -- TODO confirm). |
| conditions = ["x >= 200"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "1h" |
| |
| [[rule]] |
| # Watches the ERROR-level "logs" series for 'skiacorrectness' on host |
| # 'skia-gold-prod': derivative(value, 10m), newest point from the last |
| # 10 minutes. |
| name = "Gold Error Rate" |
| message = "The error rate for Gold is too high." |
| database = "skmetrics" |
| query = "select derivative(value, 10m) from \"logs\" WHERE time > now() - 10m AND \"name\"='skiacorrectness' AND level='ERROR' AND host='skia-gold-prod' order by time desc limit 1" |
| category = "infra" |
| # Trips once the derivative reaches 1. |
| conditions = ["x >= 1"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| # Unlike the other rules visible here, this one sets auto-dismiss to false. |
| auto-dismiss = false |
| nag = "1h" |
| |
| # |
| # Skia Alerts |
| # |
| |
| [[rule]] |
| # Averages the "prober" latency series for probe 'skiaalerts_json' over the |
| # last 10 minutes. |
| name = "Skia Alerts Latency (JSON endpoint)" |
| message = "The JSON endpoint at https://alerts.skia.org/json/alerts/ took more than 200ms to respond." |
| database = "skmetrics" |
| query = "select mean(value) from \"prober\" where time > now() - 10m AND type='latency' AND probename='skiaalerts_json'" |
| category = "infra" |
| # The value is divided by 1000000 before being compared with 200; since the |
| # message speaks of 200ms, the raw value is presumably in nanoseconds |
| # (TODO confirm against the prober). |
| conditions = ["x / 1000000 > 200"] |
| actions = ["Email(infra-alerts@skia.org)", "Email(borenet@google.com)"] |
| auto-dismiss = true |
| nag = "1h" |
| |
| # |
| # Skolo |
| # |
| |
| [[rule]] |
| # Takes the maximum of "skolo.hotspare.consecutive_failures" for app |
| # 'hotspare' over the last 10 minutes. |
| name = "skia-rpi-master-spare has been active for more than 10 minutes" |
| message = "skia-rpi-master-spare has been active for more than 10 minutes. Something is probably wrong with skia-rpi-master. go/skolo-maintenance" |
| database = "skmetrics" |
| query = "SELECT max(value) FROM \"skolo.hotspare.consecutive_failures\" WHERE time > now() - 10m AND app='hotspare'" |
| category = "infra" |
| # Trips above 600. NOTE(review): how 600 maps to the "10 minutes" in the |
| # name is not visible here (600 seconds would match) -- confirm the unit |
| # reported by the hotspare app. |
| conditions = ["x > 600"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "1h" |
| |
| [[rule]] |
| # Takes the maximum of "skolo.rpi-backup.backup-size" for app 'file-backup' |
| # over the last 25 hours; a missing backup is inferred from the size being |
| # too small within that window. |
| name = "The rpi image backup has not been backed up in some time" |
| message = "The rpi image backup has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance" |
| database = "skmetrics" |
| query = "SELECT max(value) FROM \"skolo.rpi-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'" |
| category = "infra" |
| # Trips when the largest reported size is below 100*1024*1024 (104857600); |
| # presumably bytes, i.e. 100 MiB -- TODO confirm the metric's unit. |
| conditions = ["x < 100*1024*1024"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "1h" |
| |
| [[rule]] |
| # Takes the maximum of "skolo.router-backup.backup-size" for app |
| # 'file-backup' over the last 25 hours; a missing backup is inferred from |
| # the size being too small within that window. |
| name = "The router config has not been backed up in some time" |
| message = "The router config has not been backed up in at least 25 hours. This should happen every day, but it hasn't. go/skolo-maintenance" |
| database = "skmetrics" |
| query = "SELECT max(value) FROM \"skolo.router-backup.backup-size\" WHERE time > now() - 25h AND app='file-backup'" |
| category = "infra" |
| # Trips when the largest reported size is below 1024 (presumably bytes, |
| # i.e. 1 KiB -- TODO confirm the metric's unit). |
| conditions = ["x < 1024"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "1h" |
| |
| [[rule]] |
| # Averages "liveness" over the last 20 minutes for every host matching |
| # /skia-rpi/, grouped by app and host. The %(app)s and %(host)s placeholders |
| # in name and message are presumably filled from those group-by tags, one |
| # alert per group -- confirm against the rule evaluator. |
| name = "%(app)s on %(host)s is down or crashing a lot." |
| message = "The skolo app %(app)s on %(host)s is down or is crashing a lot. The average uptime for the last 20 minutes is below 10 minutes. go/skolo-maintenance" |
| database = "skmetrics" |
| query = "select mean(value) from liveness where time > now() - 20m AND host =~ /skia-rpi/ group by app, host" |
| category = "infra" |
| # 10 * 60 = 600; the message speaks of 10 minutes, so the value is |
| # presumably in seconds (TODO confirm). |
| conditions = ["x < 10 * 60"] |
| actions = ["Email(infra-alerts@skia.org)"] |
| auto-dismiss = true |
| nag = "1h" |