| # General |
| # Fires when a target's `up` series has been 0 for 5 minutes. The log link strips the |
| # ":port" suffix from the instance label via reReplaceAll. |
| ALERT InstanceDown |
|   IF up == 0 |
|   FOR 5m |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
|   } |
| |
| # Fires when the largest liveness_uptime_s value seen over the last 6 minutes stays |
| # below 180 (60 * 3) for 5 minutes; the description reports this as crashing on startup. |
| ALERT CrashLoop |
|   IF max_over_time(liveness_uptime_s[6m]) < 60 * 3 |
|   FOR 5m |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} of job {{ $labels.job }} is crashing on startup. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
|   } |
| |
| # Fires when the peak of `load` over the last 6 minutes has exceeded 200 for 5 minutes. |
| ALERT ExcessiveLoad |
|   IF max_over_time(load[6m]) > 200 |
|   FOR 5m |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "{{ $labels.host }} is experiencing excessive {{ $labels.sub }} load. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=200&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.host }}", |
|   } |
| |
| # Warns when go_goroutines has been above 3000 for 2 minutes. |
| ALERT TooManyGoRoutines |
|   IF go_goroutines > 3000 |
|   FOR 2m |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.job }}", |
|     description = "Too many Go routines in {{ $labels.job }} running on {{ $labels.instance }}. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=400&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
|   } |
| |
| # Fires when a prober series with type="failure" has been above 0 for 5 minutes. |
| ALERT ProbeFailure |
|   IF prober{type="failure"} > 0 |
|   FOR 5m |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.probename }} {{ $labels.url }}", |
|     description = "Endpoint {{ $labels.probename }} {{ $labels.url }} has failed to respond in at least 5 minutes. See https://github.com/google/skia-buildbot/search?q={{ $labels.probename }}+filename%3Aprobers.json5 for the endpoint URL.", |
|   } |
| |
| # Warns as soon as reboot_required_i is above 0 for a host (no FOR delay). |
| ALERT RebootRequired |
|   IF reboot_required_i > 0 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Instance {{ $labels.host }} needs rebooting. Owner(s): {{ $labels.owners }}. See https://mon.skia.org/dashboard/db/reboots-required for the full list of instances that need rebooting.", |
|   } |
| |
| # Root-disk variant: warns when df_complex_free for resource "df-root" has been below |
| # 1e9 for 5 minutes. Hosts whose name matches ".*rpi-.+" are excluded. |
| ALERT DiskSpaceLow |
|   IF df_complex_free{resource="df-root", host!~".*rpi-.+"} < 1e9 |
|   FOR 5m |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Low Root Disk Space on {{ $labels.host }}.", |
|   } |
| |
| # Mounted-disk variant: warns when df_complex_free for a "df-mnt-*" resource has been |
| # below 1e10 for 5 minutes. Resources matching ".+docker.+" are excluded. |
| ALERT DiskSpaceLow |
|   IF df_complex_free{resource=~"df-mnt-.*", resource!~".+docker.+"} < 1e10 |
|   FOR 5m |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Low Disk Space on {{ $labels.host }} on disk {{ $labels.resource }}.", |
|   } |
| |
| # skia-rpi variant: warns when df_complex_free for df-var or df-tmp on a "skia-rpi-*" |
| # host has been below 1e8 (100MB per the description) for 5 minutes. |
| ALERT DiskSpaceLow |
|   IF df_complex_free{resource=~"df-var|df-tmp", host=~"skia-rpi-.+"} < 1e8 |
|   FOR 5m |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Free space has fallen below 100MB on {{ $labels.host }} drive {{ $labels.resource}}.", |
|   } |
| |
| # Warns when dirty_packages has never dropped below 1 over the last 25 hours. |
| ALERT DirtyPackages |
|   IF min_over_time(dirty_packages[25h]) >= 1 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     description = "One or more dirty packages have been running for more than 24 hours. https://push.skia.org", |
|   } |
| |
| # Warns when the 10-minute per-second rate of pulld_failed_install exceeds 1. |
| ALERT PackageInstall |
|   IF rate(pulld_failed_install[10m]) > 1 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Package failing to install via pulld on {{ $labels.host }}.", |
|   } |
| |
| # Warns when process_open_fds exceeds 2000. |
| # NOTE(review): the annotations use the `host` label, while TooManyGoRoutines (also a |
| # process-level metric) uses `job`/`instance` -- confirm `host` is set on this series. |
| ALERT TooManyOpenFDs |
|   IF process_open_fds > 2000 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.host }}", |
|     description = "Too many open file handles on {{ $labels.host }}.", |
|   } |
| |
| |
| # Datahopper |
| |
| # Fires when datahopper's ERROR log lines exceed a 10-minute per-second rate of 5. |
| ALERT DatahopperErrorRate |
|   IF rate(num_log_lines{level="ERROR", log_source="datahopper"}[10m]) > 5 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     description = "The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-datahopper2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fdatahopper", |
|   } |
| |
| # Fires when liveness_last_successful_job_metrics_update_s / 60 exceeds 30, which the |
| # description reports as 30 minutes without a job metrics update. |
| ALERT JobMetricsLiveness |
|   IF liveness_last_successful_job_metrics_update_s / 60 > 30 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update job metrics for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#job_metrics", |
|   } |
| |
| # Fires when liveness_last_successful_bot_coverage_metrics_s / 60 exceeds 60, which the |
| # description reports as 1 hour without a bot coverage metrics update. |
| ALERT BotCoverageMetricsLiveness |
|   IF liveness_last_successful_bot_coverage_metrics_s / 60 > 60 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update bot coverage metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#bot_coverage_metrics", |
|   } |
| |
| # Fires when liveness_last_successful_swarming_task_metrics_s / 60 exceeds 60, which the |
| # description reports as 1 hour without a swarming task metrics update. |
| ALERT SwarmingTaskMetricsLiveness |
|   IF liveness_last_successful_swarming_task_metrics_s / 60 > 60 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update swarming task metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_task_metrics", |
|   } |
| |
| # Fires when liveness_last_successful_event_metrics_update_s / 60 exceeds 30; the |
| # description names the affected measurement label. |
| ALERT EventMetricsLiveness |
|   IF liveness_last_successful_event_metrics_update_s / 60 > 30 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update event metrics for {{ $labels.measurement }} for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#event_metrics", |
|   } |
| |
| # Fires when liveness_last_successful_report_bot_metrics_s / 60 exceeds 10 for a pool. |
| # The description says "bot metrics" to match the metric name and the PROD.md anchor |
| # (#swarming_bot_metrics); it previously said "task metrics". |
| ALERT SwarmingBotMetricsLiveness |
|   IF liveness_last_successful_report_bot_metrics_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.pool }}", |
|     description = "{{ $labels.instance }} has failed to update swarming bot metrics for pool {{ $labels.pool }} on {{ $labels.server }} for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_bot_metrics", |
|   } |
| |
| # Swarming |
| |
| # Fires when swarming_bots_last_seen, divided by 1000^3 and then by 60, exceeds 15 |
| # (presumably nanoseconds converted to minutes -- confirm against the exporter). |
| # Bots matching "ct-gce-.*" or "build4.+device.+" are excluded. |
| ALERT BotMissing |
|   IF swarming_bots_last_seen{bot!~"(ct-gce-.*)|(build4.+device.+)"} / 1000 / 1000 / 1000 / 60 > 15 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.bot }}", |
|     description = "Swarming bot {{ $labels.bot }} is missing. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance", |
|   } |
| |
| # Fires when the largest swarming_bots_last_seen among "ct-gce-.*" bots, scaled by |
| # 1000^3 and 60 and multiplied by max(ct_gce_bots_up), exceeds 15. |
| # The divisor is 1000 (not 1024) so the scaling matches BotMissing and BotUnemployed; |
| # with 1024^3 the effective threshold was about 7% higher than the stated 15. |
| ALERT CtGceBotMissing |
|   IF max(swarming_bots_last_seen{bot=~"ct-gce-.*"}) / 1000 / 1000 / 1000 / 60 * max(ct_gce_bots_up) > 15 |
|   LABELS { severity = "critical", category = "infra", owner = "rmistry@google.com" } |
|   ANNOTATIONS { |
|     description = "1 or more CT GCE bots are down: https://chrome-swarming.appspot.com/botlist?f=status%3Adead&f=gpu%3Anone&f=pool%3ACT&l=100", |
|   } |
| |
| # Fires when swarming_bots_last_task for a "Skia.*" pool, divided by 1000^3 and then by |
| # 60 * 60, reaches 72 (72 hours without a job, per the description). |
| ALERT BotUnemployed |
|   IF swarming_bots_last_task{pool=~"Skia.*"} / 1000 / 1000 / 1000 / 60 / 60 >= 72 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.bot }}", |
|     description = "Swarming bot {{ $labels.bot }} hasn't run a job in 72 hours. Maybe its dimensions need changing? https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance", |
|   } |
| |
| # Fires when the 10-minute average of swarming_bots_quarantined reaches 1 for a bot |
| # whose device_state is neither too_hot nor low_battery (those go to DeviceUnhealthy). |
| ALERT BotQuarantined |
|   IF avg_over_time(swarming_bots_quarantined{device_state!~"(too_hot)|(low_battery)"}[10m]) >= 1 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.bot }}", |
|     description = "Swarming bot {{ $labels.bot }} is quarantined. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance", |
|   } |
| |
| # Fires when a bot quarantined for device_state too_hot or low_battery has averaged |
| # >= 1 on swarming_bots_quarantined over a full hour. |
| ALERT DeviceUnhealthy |
|   IF avg_over_time(swarming_bots_quarantined{device_state=~"(too_hot)|(low_battery)"}[1h]) >= 1 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.bot }}", |
|     description = "Swarming bot {{ $labels.bot }} is quarantined because the device is {{ $labels.device_state }} and hasn't resolved itself in 1+ hours. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance", |
|   } |
| |
| # Warns when the 10-minute average of swarming_bots_reboot_required reaches 1. |
| ALERT SwarmingBotRebootRequired |
|   IF avg_over_time(swarming_bots_reboot_required[10m]) >= 1 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.bot }}", |
|     description = "Swarming bot {{ $labels.bot }} requires reboot. After reboot, check if os dimension has changed. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance", |
|   } |
| |
| # Warns when the rounded 1-hour average of windows_skolo_os_version_count for pool |
| # "Skia" is anything other than 1. |
| ALERT WindowsSkoloOSVersion |
|   IF round(avg_over_time(windows_skolo_os_version_count{pool="Skia"}[1h])) != 1 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     description = "Windows Skolo bots OS version has diverged. https://goto.google.com/skolo-maintenance", |
|   } |
| |
| |
| # Status |
| |
| # Fires when the 10-minute average latency of the skiastatus_json probe exceeds 10s. |
| # The divisor is 1000 (not 1024): TaskSchedulerLatency compares the same prober latency |
| # metric against 300 and describes it as 300ms, so milliseconds to seconds is / 1000. |
| ALERT StatusLatency |
|   IF avg_over_time(prober{probename="skiastatus_json", type="latency"}[10m]) / 1000 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     description = "The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond.", |
|   } |
| |
| # Fires when liveness_last_successful_incremental_cache_update_s exceeds 300 (5 * 60). |
| # NOTE(review): the description contains "Playbook:" with no link after it. |
| ALERT StatusIncrementalCacheUpdate |
|   IF liveness_last_successful_incremental_cache_update_s > 5 * 60 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     description = "IncrementalCache UpdateLoop on {{ $labels.instance }} has failed to update data for more than 5 minutes. Playbook: Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fstatus", |
|   } |
| |
| # Prober |
| |
| # Fires when liveness_probes_s / 60 exceeds 10 (10 minutes per the description). |
| ALERT ProberLiveness |
|   IF liveness_probes_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     description = "Probing has failed to run in at least 10 minutes.", |
|   } |
| |
| # Fires when liveness_issue_tracker_s / 60 exceeds 30 (30 minutes per the description). |
| ALERT IssueTrackerLiveness |
|   IF liveness_issue_tracker_s / 60 > 30 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     description = "Issue tracker ingestion has failed to run in at least 30 minutes.", |
|   } |
| |
| # Gold |
| |
| # Fires when liveness_gold_expired_ignore_rules_monitoring_s on skia-gold-prod:20001 |
| # exceeds 200; the description equates this with two consecutive failed rounds. |
| ALERT GoldIgnoreMonitoring |
|   IF liveness_gold_expired_ignore_rules_monitoring_s{instance="skia-gold-prod:20001"} > 200 |
|   LABELS { severity = "critical", category = "infra", owner = "stephana@google.com" } |
|   ANNOTATIONS { |
|     description = "At least two rounds of monitoring for expired ignore rules have failed back to back.", |
|   } |
| |
| # Fires when ERROR log lines from jobs matching "skiacorrectness-.*" exceed a 2-minute |
| # per-second rate of 1. |
| ALERT GoldErrorRate |
|   IF rate(num_log_lines{level="ERROR", job=~"skiacorrectness-.*"}[2m]) > 1 |
|   LABELS { severity = "critical", category = "infra", owner = "stephana@google.com" } |
|   ANNOTATIONS { |
|     description = "The error rate for Gold {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
|   } |
| |
| # Fires when ERROR log lines from instance skia-diffserver-prod:20000 exceed a 2-minute |
| # per-second rate of 1. |
| ALERT GoldDiffServerErrorRate |
|   IF rate(num_log_lines{level="ERROR", instance="skia-diffserver-prod:20000"}[2m]) > 1 |
|   LABELS { severity = "critical", category = "infra", owner = "stephana@google.com" } |
|   ANNOTATIONS { |
|     description = "The error rate for Gold Diffserver {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
|   } |
| |
| # Fires when liveness_gold_s{metric="since-last-run", source="poll"} exceeds 750; the |
| # description equates this with two consecutive failed ingestion rounds. |
| ALERT GoldIngestionStalled |
|   IF liveness_gold_s{metric="since-last-run", source="poll"} > 750 |
|   LABELS { severity = "critical", category = "infra", owner = "stephana@google.com" } |
|   ANNOTATIONS { |
|     description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/gold-panel", |
|   } |
| |
| # Fires when ERROR log lines from jobs matching ".*_ingestion" exceed a 2-minute |
| # per-second rate of 1. |
| ALERT GoldIngestionErrorRate |
|   IF rate(num_log_lines{level="ERROR", job=~".*_ingestion"}[2m]) > 1 |
|   LABELS { severity = "critical", category = "infra", owner = "stephana@google.com" } |
|   ANNOTATIONS { |
|     description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See https://mon.skia.org/dashboard/db/gold-panel https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}", |
|   } |
| |
| # Task Scheduler |
| |
| # Fires when liveness_last_successful_task_scheduling_s / 60 exceeds 10 (10 minutes |
| # without successful scheduling, per the description). |
| ALERT TaskSchedulerLiveness |
|   IF liveness_last_successful_task_scheduling_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#scheduling_failed", |
|   } |
| |
| # Fires when the task_scheduler latency probe has been above 300 (ms, per the |
| # description) for 10 minutes. |
| ALERT TaskSchedulerLatency |
|   IF prober{type="latency", probename="task_scheduler"} > 300 |
|   FOR 10m |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.probename }}", |
|     description = "The endpoint for {{ $labels.probename }} took more than 300ms to respond. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#http_latency", |
|   } |
| |
| # Fires when task_scheduler's ERROR log lines have exceeded a 2-minute per-second rate |
| # of 0.05 for 2 minutes. |
| ALERT TaskSchedulerErrorRate |
|   IF rate(num_log_lines{level="ERROR", log_source="task_scheduler"}[2m]) > 0.05 |
|   FOR 2m |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The error rate for task_scheduler on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#error_rate", |
|   } |
| |
| # Fires when liveness_last_db_backup_s / 60 / 60 exceeds 25 (25 hours since the last |
| # backup, per the description). |
| ALERT TaskSchedulerDBBackup |
|   IF liveness_last_db_backup_s / 60 / 60 > 25 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The last Task Scheduler DB backup on {{ $labels.instance }} was more than 25 hours ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#old_db_backup", |
|   } |
| |
| # Fires when recent_db_backup_count exceeds 9. |
| ALERT TaskSchedulerExtraDBBackups |
|   IF recent_db_backup_count > 9 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "There are too many recent Task Scheduler DB backups for {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_recent_db_backups", |
|   } |
| |
| # Fires when liveness_db_backup_maybe_backup_db_s / 60 exceeds 10 (10 minutes since the |
| # last check for a backup trigger file, per the description). |
| ALERT TaskSchedulerDBBackupTrigger |
|   IF liveness_db_backup_maybe_backup_db_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The last time we checked for a Task Scheduler DB backup trigger file on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_backup_trigger_liveness", |
|   } |
| |
| # Fires when liveness_incremental_backup_s / 60 exceeds 10 (10 minutes since the last |
| # successful incremental backup, per the description). |
| ALERT TaskSchedulerIncrementalBackup |
|   IF liveness_incremental_backup_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The last time a Task Scheduler incremental backup succeeded on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_liveness", |
|   } |
| |
| # Fires as soon as incremental_backup_reset is above 0. |
| ALERT TaskSchedulerIncrementalBackupReset |
|   IF incremental_backup_reset > 0 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_reset", |
|   } |
| |
| # Fires when the 30-minute average FreePageCount of task_scheduler_db has been above |
| # 1000 for 1 hour. Instance skia-task-scheduler-internal:20000 is excluded. |
| ALERT TaskSchedulerDBFreePages |
|   IF avg_over_time(bolt_db{metric="FreePageCount", database="task_scheduler_db", instance!="skia-task-scheduler-internal:20000"}[30m]) > 1000 |
|   FOR 1h |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "There are a large number of free pages in the Task Scheduler DB on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_too_many_free_pages", |
|   } |
| |
| # Fires when liveness_DbMetric_s / 60 exceeds 30 (30 minutes without a |
| # boltutil.DbMetrics update, per the description). |
| ALERT DbMetricsLiveness |
|   IF liveness_DbMetric_s / 60 > 30 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update boltutil.DbMetrics for the last 30 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
|   } |
| |
| |
| # Fires when liveness_periodic_trigger_s for trigger="nightly", divided by 60 * 60, |
| # exceeds 25 (25 hours per the description). |
| ALERT NightlyTrigger |
|   IF liveness_periodic_trigger_s{trigger="nightly"} / 60 / 60 > 25 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly", |
|   } |
| |
| # Fires when liveness_periodic_trigger_s for trigger="weekly", divided by 60 * 60 * 24, |
| # exceeds 8 (8 days per the description). |
| ALERT WeeklyTrigger |
|   IF liveness_periodic_trigger_s{trigger="weekly"} / 60 / 60 / 24 > 8 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "The weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly", |
|   } |
| |
| # Fires when task_candidate_count exceeds 1500; the description reports the affected |
| # dimensions label. |
| ALERT TaskSchedulerTooManyCandidates |
|   IF task_candidate_count > 1500 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "There are too many task candidates for dimensions: {{ $labels.dimensions }} https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_candidates", |
|   } |
| |
| |
| # Fires when liveness_last_successful_overdue_metrics_update_s / 60 exceeds 10 |
| # (10 minutes without an overdue_job_specs_s update, per the description). |
| ALERT OverdueMetricsLiveness |
|   IF liveness_last_successful_overdue_metrics_update_s / 60 > 10 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.instance }}", |
|     description = "{{ $labels.instance }} has failed to update overdue_job_specs_s for the last 10 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }} Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_metrics_liveness", |
|   } |
| |
| # Tasks for these jobs expire after 4 hours and get 2 attempts, so a job should |
| # normally finish within 8 hours. Covers job_trigger "" or "master", excluding job |
| # names containing Valgrind, MSAN or -x86- (handled by OverdueJobSpecLong). |
| ALERT OverdueJobSpec |
|   IF overdue_job_specs_s{job_trigger=~"|master", job_name!~".*(Valgrind|MSAN|-x86-).*"} / 60 / 60 > 8 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.job_name }}", |
|     description = "{{ $labels.job_name }} has not finished for any commit in the last 8 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec", |
|   } |
| |
| # These jobs have tasks with an expiration of 9 hours, and we allow 2 attempts, so they |
| # should normally finish within 18 hours. Covers job_trigger "" or "master" for job |
| # names containing Valgrind, MSAN or -x86-. |
| # The description states 18 hours to match the threshold; it previously said 9 hours, |
| # which is the per-task expiration rather than the alert threshold. |
| ALERT OverdueJobSpecLong |
|   IF overdue_job_specs_s{job_trigger=~"|master", job_name=~".*(Valgrind|MSAN|-x86-).*"} / 60 / 60 > 18 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.job_name }}", |
|     description = "{{ $labels.job_name }} has not finished for any commit in the last 18 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec", |
|   } |
| |
| # Nightly jobs: fires when overdue_job_specs_s / 60 / 60 exceeds 28. |
| # Note: there is no equivalent alert for job_trigger="weekly" because Task Scheduler's |
| # scheduling window is only four days. |
| ALERT OverdueJobSpecNightly |
|   IF overdue_job_specs_s{job_trigger="nightly"} / 60 / 60 > 28 |
|   LABELS { severity = "critical", category = "infra", owner = "borenet@google.com" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.job_name }}", |
|     description = "{{ $labels.job_name }} has not completed in the last 28 hours (nightly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec", |
|   } |
| |
| |
| # Skolo |
| |
| # Skolo backups: fires when liveness_skolo_last_backup_s / 60 / 60 exceeds 25. |
| # The alert name BackupNotDone is also used by the Datastore Backup rule in this file. |
| ALERT BackupNotDone |
|   IF liveness_skolo_last_backup_s{} / 60 / 60 > 25 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     abbr = "{{ $labels.federated }} {{ $labels.backup }}", |
|     description = "The automated backup of {{ $labels.backup }} was not completed by {{ $labels.federated }} in the last 25 hours. http://go/skolo-maintenance", |
|   } |
| |
| # Fires when skolo_hotspare_spare_active equals 1, which the description reports as the |
| # spare skia-rpi-master being active. |
| ALERT RpiMaster |
|   IF skolo_hotspare_spare_active == 1 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     description = "skia-rpi-master-spare is active. Something is possibly wrong with skia-rpi-master. http://go/skolo-maintenance", |
|   } |
| |
| # |
| # QPS to external services. |
| # |
| |
| # General: warns when the 30-minute request rate summed per host exceeds 25. |
| # Job "skiaperf" and host "www.googleapis.com" are excluded from the sum. |
| # TODO(borenet): Specific alerts for Swarming. |
| ALERT HighExternalQPS |
|   IF sum(rate(http_request_metrics{job!="skiaperf", host!="www.googleapis.com"}[30m])) by (host) > 25 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     description = "QPS to {{ $labels.host }} is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps", |
|   } |
| |
| # www.googleapis.com gets a higher threshold: warns when the total 30-minute request |
| # rate to that host exceeds 60. |
| ALERT HighExternalQPSGoogleAPIs |
|   IF sum(rate(http_request_metrics{host="www.googleapis.com"}[30m])) > 60 |
|   LABELS { severity = "warning", category = "infra" } |
|   ANNOTATIONS { |
|     description = "QPS to www.googleapis.com is high. Verify that this is expected. See https://mon.skia.org/dashboard/db/outgoing-qps", |
|   } |
| |
| # Datastore Backup |
| |
| # Fires when the 5-minute average of liveness_backup_step_s is below 60, which the |
| # description reports as datastore_backup being in a boot loop. |
| ALERT Bootloop |
|   IF avg_over_time(liveness_backup_step_s[5m]) < 60 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     description = "datastore_backup appears to be in a boot loop. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#boot_loop", |
|   } |
| |
| # Cloud Datastore backups: fires when liveness_backup_success_s / 60 / 60 exceeds 25. |
| # The alert name BackupNotDone is also used by the Skolo rule in this file. |
| ALERT BackupNotDone |
|   IF liveness_backup_success_s / 60 / 60 > 25 |
|   LABELS { severity = "critical", category = "infra" } |
|   ANNOTATIONS { |
|     description = "A backup of Cloud Datastore hasn't succeeded in the last 25 hours. https://skia.googlesource.com/buildbot/%2B/master/ds/PROD.md#backup_not_done", |
|   } |
| |
| |
| # alert-to-pubsub |
| |
| # Fires when the 5-minute per-second rate of pubsub_send_failure exceeds 0.003. |
| ALERT AlertToPubSub |
|   IF rate(pubsub_send_failure[5m]) > 0.003 |
|   LABELS { severity = "critical", category = "infra", owner = "jcgregorio@google.com" } |
|   ANNOTATIONS { |
|     abbr = "google.com:skia-buildbots", |
|     description = "Failed to send alert via PubSub. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}", |
|   } |