| # General |
| # InstanceDown: fires when a target's `up` series has been 0 for 5 minutes. |
| # The description links to the Cloud Logging viewer; reReplaceAll removes the |
| # ":<digits>" port suffix from the instance label to build the log resource name. |
| ALERT InstanceDown |
| IF up == 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.instance }}", |
| description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
| } |
| |
| # CrashLoop: fires when the highest liveness_uptime_s value seen in the last |
| # 6 minutes stays below 60*3 (180; presumably seconds, per the _s suffix) for |
| # 5 minutes, which the description interprets as crashing on startup. |
| ALERT CrashLoop |
| IF max_over_time(liveness_uptime_s[6m]) < 60*3 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.instance }}", |
| description = "{{ $labels.instance }} of job {{ $labels.job }} is crashing on startup. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
| } |
| |
| # ExcessiveLoad: fires when the maximum of `load` over the last 6 minutes has |
| # been above 200 for 5 minutes. The `sub` label names the kind of load reported. |
| ALERT ExcessiveLoad |
| IF max_over_time(load[6m]) > 200 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "{{ $labels.host }} is experiencing excessive {{ $labels.sub }} load. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=200&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.host }}" |
| } |
| |
| # TooManyGoRoutines: warns when go_goroutines has been above 3000 for 2 minutes. |
| # The log link strips the ":<digits>" port suffix from the instance label. |
| ALERT TooManyGoRoutines |
| IF go_goroutines > 3000 |
| FOR 2m |
| LABELS { category = "infra", severity = "warning"} |
| ANNOTATIONS { |
| abbr = "{{ $labels.job }}", |
| description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=400&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}" |
| } |
| |
| # ProbeFailure: fires when the prober series with type="failure" has been |
| # above 0 for 5 minutes. The description points at probers.json5 (via a GitHub |
| # search on the probename label) for the endpoint definition. |
| ALERT ProbeFailure |
| IF prober{type="failure"} > 0 |
| FOR 5m |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.probename }} {{ $labels.url }}", |
| description = "Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobers.json5 for the endpoint URL." |
| } |
| |
| # RebootRequired: warns whenever reboot_required_i is above 0. There is no FOR |
| # clause, so no hold period applies. The `owners` label is surfaced in the text. |
| ALERT RebootRequired |
| IF reboot_required_i > 0 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Instance {{ $labels.host }} needs rebooting. Owner(s): {{ $labels.owners }}. See https://mon.skia.org/dashboard/db/reboots-required for the full list of instances that need rebooting.", |
| } |
| |
| # GCE machines (other than bots), root disk. |
| # Warns when df-root free space has been below 1e9 (presumably bytes) for |
| # 5 minutes. The negative host matcher skips hosts named skia-e-*, skia-i-*, |
| # skia-d-* and skia-rpi-*, which are handled by the bot-specific rules. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host!~"skia-(e|i|d|rpi)-.+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Low Root Disk Space on {{ $labels.host }}.", |
| } |
| |
| # GCE persistent disk mounts. |
| # Warns when free space on any df-mnt-* resource has been below 1e10 |
| # (presumably bytes) for 5 minutes. Resources with "docker" in the middle of |
| # their name are excluded by the second matcher. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-mnt-.*",resource!~".+docker.+"} < 1e10 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Low Disk Space on {{ $labels.host }} on disk {{ $labels.resource }}.", |
| } |
| |
| # External bots except RPis. |
| # Warns when df-root free space has been below 1e9 (presumably bytes) for |
| # 5 minutes on skia-e-* hosts. The [^r] after "skia-e-" rejects any name that |
| # continues with "r", which is how skia-e-rpi-* hosts are excluded. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host=~"skia-e-[^r].+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Low Root Disk Space on {{ $labels.host }}. https://chromium-swarm.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # External RPis. |
| # Warns when df-var or df-tmp free space has been below 1e8 (100MB per the |
| # description) for 5 minutes. The optional "(e-)?" group makes the matcher |
| # accept both skia-rpi-* and skia-e-rpi-* host names. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-var|df-tmp",host=~"skia-(e-)?rpi-.+"} < 1e8 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}. https://chromium-swarm.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # Internal bots except RPis. |
| # Warns when df-root free space has been below 1e9 (presumably bytes) for |
| # 5 minutes on skia-i-* hosts. The [^r] after "skia-i-" rejects any name that |
| # continues with "r", which is how skia-i-rpi-* hosts are excluded. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host=~"skia-i-[^r].+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Low Root Disk Space on {{ $labels.host }}. https://chrome-swarming.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # Internal RPis. |
| # Warns when df-var or df-tmp free space on skia-i-rpi-* hosts has been below |
| # 1e8 (100MB per the description) for 5 minutes. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-var|df-tmp",host=~"skia-i-rpi-.+"} < 1e8 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}. https://chrome-swarming.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # Dev bots except RPis. |
| # Warns when df-root free space has been below 1e9 (presumably bytes) for |
| # 5 minutes on skia-d-* hosts. The [^r] after "skia-d-" rejects any name that |
| # continues with "r", which is how skia-d-rpi-* hosts are excluded. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource="df-root",host=~"skia-d-[^r].+"} < 1e9 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Low Root Disk Space on {{ $labels.host }}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # Dev RPis. |
| # Warns when df-var or df-tmp free space on skia-d-rpi-* hosts has been below |
| # 1e8 (100MB per the description) for 5 minutes. |
| ALERT DiskSpaceLow |
| IF df_complex_free{resource=~"df-var|df-tmp",host=~"skia-d-rpi-.+"} < 1e8 |
| FOR 5m |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}. https://chromium-swarm-dev.appspot.com/bot?id={{ $labels.host }}", |
| } |
| |
| # DirtyPackages: warns when the minimum of dirty_packages over the last 25 |
| # hours is at least 1, i.e. the value never dropped below 1 in that window. |
| # Note the query window is 25h while the description text says 24 hours. |
| ALERT DirtyPackages |
| IF min_over_time(dirty_packages[25h]) >= 1 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "One or more dirty packages have been running for more than 24 hours. https://push.skia.org", |
| } |
| |
| # PackageInstall: warns when the per-second rate of pulld_failed_install, |
| # averaged over 10 minutes, exceeds 1. |
| # NOTE(review): rate() is per-second, so this needs more than one failed |
| # install per second on average - confirm the threshold is intentional. |
| ALERT PackageInstall |
| IF rate(pulld_failed_install[10m]) > 1 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Package failing to install via pulld on {{ $labels.host }}.", |
| } |
| |
| # TooManyOpenFDs: warns whenever process_open_fds is above 2000 (no FOR clause). |
| # NOTE(review): the annotations read the `host` label, while the other |
| # process-level rule in this file (TooManyGoRoutines) uses job/instance - |
| # confirm process_open_fds actually carries a `host` label. |
| ALERT TooManyOpenFDs |
| IF process_open_fds > 2000 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.host }}", |
| description = "Too many open file handles on {{ $labels.host }}.", |
| } |
| |
| # Prober |
| |
| # ProberLiveness: fires when liveness_probes_s divided by 60 exceeds 10, which |
| # the description reads as probing not having run for at least 10 minutes. |
| ALERT ProberLiveness |
| IF liveness_probes_s/60 > 10 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Probing has failed to run in at least 10 minutes." |
| } |
| |
| # IssueTrackerLiveness: fires when liveness_issue_tracker_s divided by 60 |
| # exceeds 30, which the description reads as issue tracker ingestion not |
| # having run for at least 30 minutes. |
| ALERT IssueTrackerLiveness |
| IF liveness_issue_tracker_s/60 > 30 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "Issue tracker ingestion has failed to run in at least 30 minutes." |
| } |
| |
| # Gold |
| |
| # GoldIgnoreMonitoring: fires when the expired-ignore-rules liveness metric on |
| # instance skia-gold-prod:20001 exceeds 200 (presumably seconds, per the _s |
| # suffix). The description equates that with two consecutive failed rounds. |
| ALERT GoldIgnoreMonitoring |
| IF liveness_gold_expired_ignore_rules_monitoring_s{instance="skia-gold-prod:20001"} > 200 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "At least two rounds of monitoring for expired ignore rules have failed back to back. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldignoremonitoring", |
| } |
| |
| # GoldErrorRate: fires when ERROR-level num_log_lines for jobs matching |
| # skiacorrectness-.* grow faster than 1 per second, averaged over 2 minutes. |
| # The log link is built from the log_group and log_source labels. |
| ALERT GoldErrorRate |
| IF rate(num_log_lines{level="ERROR",job=~"skiacorrectness-.*"}[2m]) > 1 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "The error rate for Gold {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#golderrorrate https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
| } |
| |
| # GoldDiffServerErrorRate: fires when ERROR-level num_log_lines on instance |
| # skia-diffserver-prod:20000 grow faster than 1 per second, averaged over |
| # 2 minutes. The log link is built from the log_group and log_source labels. |
| ALERT GoldDiffServerErrorRate |
| IF rate(num_log_lines{level="ERROR", instance="skia-diffserver-prod:20000"}[2m]) > 1 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "The error rate for Gold Diffserver {{ $labels.instance }} is too high. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#golddiffservererrorrate https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
| } |
| |
| # GoldIngestionStalled: fires when liveness_gold_s with metric="since-last-run" |
| # and source="poll" exceeds 750 (presumably seconds, per the _s suffix). The |
| # description equates that with two consecutive failed ingestion rounds. |
| ALERT GoldIngestionStalled |
| IF liveness_gold_s{metric="since-last-run",source="poll"} > 750 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/gold-panel", |
| } |
| |
| # GoldIngestionErrorRate: fires when ERROR-level num_log_lines for jobs whose |
| # name ends in "_ingestion" grow faster than 1 per second, averaged over |
| # 2 minutes. |
| # NOTE(review): the description reads the `corpus` label - confirm that |
| # num_log_lines series carry it. |
| ALERT GoldIngestionErrorRate |
| IF rate(num_log_lines{level="ERROR",job=~".*_ingestion"}[2m]) > 1 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldingestionerrorrate https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
| } |
| |
| # GoldCommitTooOldWallTime: fires when gold_last_commit_age_s with |
| # type="wall_time" exceeds 24*60*60 (24 hours in seconds, per the description). |
| # The skiacorrectness-pdfium job is excluded from this check. |
| ALERT GoldCommitTooOldWallTime |
| IF gold_last_commit_age_s{type="wall_time",job!="skiacorrectness-pdfium"} > 24*60*60 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| abbr = "{{ $labels.job }}", |
| description = "It has been at least 24 hours since the last commit made it into Gold for {{ $labels.job }}. Some process might have hung, or perhaps that repo simply has not seen a commit in that period. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcommittoooldwalltime", |
| } |
| |
| # GoldCommitTooOldNewerCommit: fires when gold_last_commit_age_s with |
| # type="with_new_commit" exceeds 1*60*60 (1 hour in seconds, per the |
| # description), i.e. a newer commit exists that Gold has not picked up. |
| ALERT GoldCommitTooOldNewerCommit |
| IF gold_last_commit_age_s{type="with_new_commit"} > 1*60*60 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| abbr = "{{ $labels.job }}", |
| description = "It has been at least 1 hour since a new commit landed, and Gold for {{ $labels.job }} still hasn't picked it up. Some process might have hung. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldcommittoooldnewercommit", |
| } |
| |
| # GoldStatusStalled: fires when liveness_gold_status_monitoring_s exceeds |
| # 10*60 (10 minutes in seconds, per the description). |
| ALERT GoldStatusStalled |
| IF liveness_gold_status_monitoring_s > 10*60 |
| LABELS { category = "infra", severity = "critical", owner = "kjlubick@google.com"} |
| ANNOTATIONS { |
| description = "It has been at least 10 minutes since the Gold's status was re-computed. Some process might have hung. https://skia.googlesource.com/buildbot/+/refs/heads/master/golden/docs/PROD.md#goldstatusstalled", |
| } |
| |
| # Skolo |
| |
| # BackupNotDone (Skolo): fires when liveness_skolo_last_backup_s divided by |
| # 60 twice (seconds to hours, per the description) exceeds 25. The `backup` |
| # and `federated` labels identify which backup and which source is late. |
| ALERT BackupNotDone |
| IF liveness_skolo_last_backup_s{}/60/60 > 25 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| abbr = "{{ $labels.federated }} {{ $labels.backup }}", |
| description = "The automated backup of {{ $labels.backup }} was not completed by {{ $labels.federated }} in the last 25 hours. http://go/skolo-maintenance", |
| } |
| |
| # RpiMaster: fires whenever skolo_hotspare_spare_active equals 1, which the |
| # description reads as the spare (skia-rpi-master-spare) having taken over. |
| ALERT RpiMaster |
| IF skolo_hotspare_spare_active == 1 |
| LABELS { category = "infra", severity = "critical"} |
| ANNOTATIONS { |
| description = "skia-rpi-master-spare is active. Something is possibly wrong with skia-rpi-master. http://go/skolo-maintenance", |
| } |
| |
| # |
| # QPS to external services. |
| # |
| |
| # General. |
| # TODO(borenet): Specific alerts for Swarming. |
| # Warns when the 30-minute request rate, summed per destination `host`, |
| # exceeds 25 per second. Series from job "skiaperf" and requests to |
| # www.googleapis.com are excluded. Only the `host` label survives the |
| # aggregation, so it is the only label available to the description. |
| ALERT HighExternalQPS |
| IF sum(rate(http_request_metrics{job!="skiaperf",host!="www.googleapis.com"}[30m])) by (host) > 25 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "QPS to {{ $labels.host }} is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps" |
| } |
| |
| # www.googleapis.com has a higher threshold. |
| # Warns when the 30-minute request rate to www.googleapis.com, summed across |
| # all series, exceeds 60 per second. The sum has no `by` clause, so the result |
| # carries no labels and the description is a fixed string. |
| ALERT HighExternalQPSGoogleAPIs |
| IF sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60 |
| LABELS { category = "infra", severity = "warning" } |
| ANNOTATIONS { |
| description = "QPS to www.googleapis.com is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps" |
| } |
| |
| # Datastore Backup |
| |
| # Bootloop: fires when the 5-minute average of liveness_backup_step_s is |
| # below 60 (presumably seconds, per the _s suffix). The description reads |
| # this as datastore_backup being in a boot loop. |
| ALERT Bootloop |
| IF avg_over_time(liveness_backup_step_s[5m]) < 60 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "datastore_backup appears to be in a boot loop. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#boot_loop" |
| } |
| |
| # BackupNotDone (Cloud Datastore): fires when liveness_backup_success_s |
| # divided by 60 twice (seconds to hours, per the description) exceeds 25. |
| # Shares its alert name with the Skolo backup rule but watches another metric. |
| ALERT BackupNotDone |
| IF liveness_backup_success_s/60/60 > 25 |
| LABELS { category = "infra", severity = "critical" } |
| ANNOTATIONS { |
| description = "A backup of Cloud Datastore hasn't succeeded in the last 25 hours. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done" |
| } |
| |
| |
| # alert-to-pubsub |
| |
| # AlertToPubSub: fires when the 5-minute rate of pubsub_send_failure exceeds |
| # 0.003 per second (about one failure every 333 seconds). The abbr is a fixed |
| # project name; the log link strips the ":<digits>" port from the instance. |
| ALERT AlertToPubSub |
| IF rate(pubsub_send_failure[5m]) > 0.003 |
| LABELS { category = "infra", severity = "critical", owner = "jcgregorio@google.com" } |
| ANNOTATIONS { |
| abbr = "google.com:skia-buildbots", |
| description = "Failed to send alert via PubSub. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}" |
| } |