# blob: 7a42545b093aa32a770e7b50b837e6187c9d8332 [file] [log] [blame]
# This file defines alerts to be triggered by the server.
[[rule]]
# Trigger: a single point (limit 1) read from the
# skiaperf.alerting.new.value series is greater than zero.
# NOTE(review): `x` in `condition` is presumably the query result -- confirm.
name = "Perf Alerts"
query = "select value from skiaperf.alerting.new.value limit 1"
condition = "x > 0"
# Notification text and recipients.
message = "At least one perf alert has been found. Please visit https://skiaperf.com/alerts/ to triage."
actions = ["Email(alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "24h"
auto-dismiss = true
[[rule]]
# Trigger: the mean of prober.master_host.failure.value over the last
# 10 minutes is at least 1.
# NOTE(review): presumably the prober records 1 per failed probe, so a mean
# of 1 means every probe in the window failed -- confirm.
name = "Buildslaves offline (client.skia)"
query = "select mean(value) from prober.master_host.failure.value where time > now() - 10m"
condition = "x >= 1"
# Notification text and recipients.
message = "At least one buildslave has been offline for more than ten minutes: http://build.chromium.org/p/client.skia/buildslaves"
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = false
[[rule]]
# Trigger: the mean of prober.android_master_host.failure.value over the
# last 10 minutes is at least 1.
# NOTE(review): presumably the prober records 1 per failed probe, so a mean
# of 1 means every probe in the window failed -- confirm.
name = "Buildslaves offline (client.skia.android)"
query = "select mean(value) from prober.android_master_host.failure.value where time > now() - 10m"
condition = "x >= 1"
# Notification text and recipients.
message = "At least one buildslave has been offline for more than ten minutes: http://build.chromium.org/p/client.skia.android/buildslaves"
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = false
[[rule]]
# Trigger: the mean of prober.compile_master_host.failure.value over the
# last 10 minutes is at least 1.
# NOTE(review): presumably the prober records 1 per failed probe, so a mean
# of 1 means every probe in the window failed -- confirm.
name = "Buildslaves offline (client.skia.compile)"
query = "select mean(value) from prober.compile_master_host.failure.value where time > now() - 10m"
condition = "x >= 1"
# Notification text and recipients.
message = "At least one buildslave has been offline for more than ten minutes: http://build.chromium.org/p/client.skia.compile/buildslaves"
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = false
[[rule]]
# Trigger: the mean of prober.fyi_master_host.failure.value over the last
# 10 minutes is at least 1.
# NOTE(review): presumably the prober records 1 per failed probe, so a mean
# of 1 means every probe in the window failed -- confirm.
name = "Buildslaves offline (client.skia.fyi)"
query = "select mean(value) from prober.fyi_master_host.failure.value where time > now() - 10m"
condition = "x >= 1"
# Notification text and recipients.
message = "At least one buildslave has been offline for more than ten minutes: http://build.chromium.org/p/client.skia.fyi/buildslaves"
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = false
[[rule]]
# Trigger: the 10-minute mean of the nano-ingest
# time-since-last-successful-update gauge is at least 750.
# NOTE(review): the unit of the 750 threshold is not stated in this file
# (seconds?); confirm it matches the "two rounds" wording in the message.
name = "Ingestion Failure (nanobench)"
query = "select mean(value) from ingest.ingester.nano-ingest.gauge.time-since-last-successful-update.value where time > now() - 10m"
condition = "x >= 750"
# Notification text and recipients.
message = "At least two rounds of data ingestion have failed back to back."
actions = ["Email(infra-alerts@skia.org)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = false
[[rule]]
# Trigger: the mean of prober.skfiddle.failure.value over the last
# 10 minutes is at least 1.
# NOTE(review): presumably the prober records 1 per failed probe, so a mean
# of 1 means every probe in the window failed -- confirm.
name = "Skia Fiddle Prober (main page)"
query = "select mean(value) from prober.skfiddle.failure.value where time > now() - 10m;"
condition = "x >= 1"
# Notification text and recipients.
message = "The main page at http://skfiddle.com has failed."
actions = ["Email(infra-alerts@skia.org)", "Email(humper@google.com)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = true
[[rule]]
# Watches the skfiddle_compile_bad prober series. The name carries the probe
# variant so this alert cannot be confused with the rule that watches
# skfiddle_compile_good; rule names should be unique so recipients can tell
# which probe fired.
# Trigger: the mean of prober.skfiddle_compile_bad.failure.value over the
# last 10 minutes is at least 1.
# NOTE(review): presumably this probe submits code that is expected NOT to
# compile, in which case the message wording may merit a follow-up -- confirm.
name = "Skia Fiddle Prober (bad compile)"
query = "select mean(value) from prober.skfiddle_compile_bad.failure.value where time > now() - 10m;"
condition = "x >= 1"
# Notification text and recipients.
message = "A test compile at http://skfiddle.com has failed for more than 10 minutes."
actions = ["Email(infra-alerts@skia.org)", "Email(humper@google.com)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = true
[[rule]]
# Watches the skfiddle_compile_good prober series. The name carries the probe
# variant so this alert cannot be confused with the rule that watches
# skfiddle_compile_bad; rule names should be unique so recipients can tell
# which probe fired.
# Trigger: the mean of prober.skfiddle_compile_good.failure.value over the
# last 10 minutes is at least 1.
# NOTE(review): presumably this probe submits code that is expected to
# compile successfully -- confirm.
name = "Skia Fiddle Prober (good compile)"
query = "select mean(value) from prober.skfiddle_compile_good.failure.value where time > now() - 10m;"
condition = "x >= 1"
# Notification text and recipients.
message = "A test compile at http://skfiddle.com has failed for more than 10 minutes."
actions = ["Email(infra-alerts@skia.org)", "Email(humper@google.com)"]
# NOTE(review): presumably the reminder interval, and whether the alert
# clears itself once the condition stops holding -- confirm.
nag = "1h"
auto-dismiss = true