[am] Start adding owners to alerts

am.skia.org should soon support handling these owner labels.
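
For context, a minimal sketch of how the owner label attaches in each rule
format (the alert name, expression, and owner address below are illustrative
only, not part of this change):

  # Legacy prometheus/sys/alert.rules syntax:
  ALERT ExampleLiveness
    IF liveness_example_s/60 > 10
    LABELS { category = "infra", severity = "critical", owner = "someone@google.com" }
    ANNOTATIONS {
      description = "Example alert carrying an owner label."
    }

  # promk/prometheus/alerts_*.yml syntax:
  - alert: ExampleLiveness
    expr: liveness_example_s / 60 > 10
    labels:
      category: infra
      severity: critical
      owner: someone@google.com
    annotations:
      description: 'Example alert carrying an owner label.'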

Bug: skia:8645
Change-Id: I0e79a149e75863f31b53a994528eb957b1949b87
Reviewed-on: https://skia-review.googlesource.com/c/179244
Commit-Queue: Ravi Mistry <rmistry@google.com>
Reviewed-by: Eric Boren <borenet@google.com>
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index ec940e8..7d4b260 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules
@@ -107,14 +107,14 @@
 
 ALERT DatahopperErrorRate
   IF rate(num_log_lines{level="ERROR",log_source="datahopper"}[10m]) > 5
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     description = "The error rate for datahopper is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2Fskia-datahopper2&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fdatahopper"
   }
 
 ALERT JobMetricsLiveness
   IF liveness_last_successful_job_metrics_update_s/60 > 30
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update job metrics for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#job_metrics"
@@ -122,7 +122,7 @@
 
 ALERT BotCoverageMetricsLiveness
   IF liveness_last_successful_bot_coverage_metrics_s/60 > 60
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update bot coverage metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#bot_coverage_metrics"
@@ -130,7 +130,7 @@
 
 ALERT SwarmingTaskMetricsLiveness
   IF liveness_last_successful_swarming_task_metrics_s/60 > 60
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update swarming task metrics for the last 1 hour. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_task_metrics"
@@ -138,7 +138,7 @@
 
 ALERT EventMetricsLiveness
   IF liveness_last_successful_event_metrics_update_s/60 > 30
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update event metrics for {{ $labels.measurement }} for the last 30 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#event_metrics"
@@ -146,7 +146,7 @@
 
 ALERT SwarmingBotMetricsLiveness
   IF liveness_last_successful_report_bot_metrics_s/60 > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.pool }}",
     description = "{{ $labels.instance }} has failed to update swarming task metrics for pool {{ $labels.pool }} on {{ $labels.server }} for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/datahopper/PROD.md#swarming_bot_metrics"
@@ -164,7 +164,7 @@
 
 ALERT CtGceBotMissing
   IF max(swarming_bots_last_seen{bot=~"ct-gce-.*"})/1024/1024/1024/60 * max(ct_gce_bots_up) > 15
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "rmistry@google.com"}
   ANNOTATIONS {
     description = "1 or more CT GCE bots are down: https://chrome-swarming.appspot.com/botlist?f=status%3Adead&f=gpu%3Anone&f=pool%3ACT&l=100"
   }
@@ -213,14 +213,14 @@
 
 ALERT StatusLatency
   IF avg_over_time(prober{probename="skiastatus_json",type="latency"}[10m])/1024  > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     description = "The JSON endpoint at https://status.skia.org/json/skia/commits/ took more than 10s to respond."
   }
 
 ALERT StatusIncrementalCacheUpdate
   IF liveness_last_successful_incremental_cache_update_s > 5*60
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     description = "IncrementalCache UpdateLoop on {{ $labels.instance }} has failed to update data for more than 5 minutes. Playbook: Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2Fstatus"
   }
@@ -245,35 +245,35 @@
 
 ALERT GoldIgnoreMonitoring
   IF liveness_gold_expired_ignore_rules_monitoring_s{instance="skia-gold-prod:20001"} > 200
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "stephana@google.com"}
   ANNOTATIONS {
     description = "At least two rounds of monitoring for expired ignore rules have failed back to back.",
   }
 
 ALERT GoldErrorRate
   IF rate(num_log_lines{level="ERROR",job=~"skiacorrectness-.*"}[2m]) > 1
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "stephana@google.com"}
   ANNOTATIONS {
     description = "The error rate for Gold {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}",
   }
 
 ALERT GoldDiffServerErrorRate
   IF rate(num_log_lines{level="ERROR", instance="skia-diffserver-prod:20000"}[2m]) > 1
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "stephana@google.com"}
   ANNOTATIONS {
     description = "The error rate for Gold Diffserver {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}",
   }
 
 ALERT GoldIngestionStalled
   IF liveness_gold_s{metric="since-last-run",source="poll"} > 750
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "stephana@google.com"}
   ANNOTATIONS {
     description = "At least two rounds of gold ingestion have failed back to back for {{ $labels.corpus }}. See https://mon.skia.org/dashboard/db/gold-panel",
   }
 
 ALERT GoldIngestionErrorRate
   IF rate(num_log_lines{level="ERROR",job=~".*_ingestion"}[2m]) > 1
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "stephana@google.com"}
   ANNOTATIONS {
     description = "The error rate for Gold Ingestion {{ $labels.corpus }} is too high. See https://mon.skia.org/dashboard/db/gold-panel https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }}",
   }
@@ -282,7 +282,7 @@
 
 ALERT TaskSchedulerLiveness
   IF liveness_last_successful_task_scheduling_s/60 > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to schedule for the last 10 minutes. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#scheduling_failed"
@@ -291,7 +291,7 @@
 ALERT TaskSchedulerLatency
   IF prober{type="latency",probename="task_scheduler"} > 300
   FOR 10m
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.probename }}",
     description = "The endpoint for {{ $labels.probename }} took more than 300ms to respond. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#http_latency"
@@ -300,7 +300,7 @@
 ALERT TaskSchedulerErrorRate
   IF rate(num_log_lines{level="ERROR",log_source="task_scheduler"}[2m]) > 0.05
   FOR 2m
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The error rate for task_scheduler on {{ $labels.instance }} is too high. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ $labels.log_group }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.log_source }} https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#error_rate"
@@ -308,7 +308,7 @@
 
 ALERT TaskSchedulerDBBackup
   IF liveness_last_db_backup_s/60/60 > 25
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The last Task Scheduler DB backup on {{ $labels.instance }} was more than 25 hours ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#old_db_backup"
@@ -316,7 +316,7 @@
 
 ALERT TaskSchedulerExtraDBBackups
   IF recent_db_backup_count > 9
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "There are too many recent Task Scheduler DB backups for {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_recent_db_backups"
@@ -324,7 +324,7 @@
 
 ALERT TaskSchedulerDBBackupTrigger
   IF liveness_db_backup_maybe_backup_db_s/60 > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The last time we checked for a Task Scheduler DB backup trigger file on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_backup_trigger_liveness"
@@ -332,7 +332,7 @@
 
 ALERT TaskSchedulerIncrementalBackup
   IF liveness_incremental_backup_s/60 > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The last time a Task Scheduler incremental backup succeeded on {{ $labels.instance }} was more than 10 minutes ago. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_liveness"
@@ -340,7 +340,7 @@
 
 ALERT TaskSchedulerIncrementalBackupReset
   IF incremental_backup_reset > 0
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "Task Scheduler modified job tracking for incremental backups has been reset since last full backup on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#incremental_backup_reset"
@@ -349,7 +349,7 @@
 ALERT TaskSchedulerDBFreePages
   IF avg_over_time(bolt_db{metric="FreePageCount",database="task_scheduler_db",instance!="skia-task-scheduler-internal:20000"}[30m]) > 1000
   FOR 1h
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "There are a large number of free pages in the Task Scheduler DB on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#db_too_many_free_pages"
@@ -357,7 +357,7 @@
 
 ALERT DbMetricsLiveness
   IF liveness_DbMetric_s/60 > 30
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update boltutil.DbMetrics for the last 30 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}"
@@ -366,7 +366,7 @@
 
 ALERT NightlyTrigger
   IF liveness_periodic_trigger_s{trigger="nightly"}/60/60 > 25
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The nightly trigger has not run in over 25 hours on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_nightly"
@@ -374,7 +374,7 @@
 
 ALERT WeeklyTrigger
   IF liveness_periodic_trigger_s{trigger="weekly"}/60/60/24 > 8
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "The weekly trigger has not run in over 8 days on {{ $labels.instance }}. https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#trigger_weekly"
@@ -382,7 +382,7 @@
 
 ALERT TaskSchedulerTooManyCandidates
   IF task_candidate_count > 1500
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "There are too many task candidates for dimensions: {{ $labels.dimensions }} https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#too_many_candidates"
@@ -391,7 +391,7 @@
 
 ALERT OverdueMetricsLiveness
   IF liveness_last_successful_overdue_metrics_update_s/60 > 10
-  LABELS { category = "infra", severity = "critical"}
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.instance }}",
     description = "{{ $labels.instance }} has failed to update overdue_job_specs_s for the last 10 minutes. Logs: https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }} Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_metrics_liveness"
@@ -401,7 +401,7 @@
 # normally finish within 8 hours.
 ALERT OverdueJobSpec
   IF overdue_job_specs_s{job_trigger=~"|master",job_name!~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 8
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.job_name }}",
     description = "{{ $labels.job_name }} has not finished for any commit in the last 8 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
@@ -411,7 +411,7 @@
 # normally finish within 18 hours.
 ALERT OverdueJobSpecLong
   IF overdue_job_specs_s{job_trigger=~"|master",job_name=~".*(Valgrind|MSAN|-x86-).*"}/60/60 > 18
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.job_name }}",
     description = "{{ $labels.job_name }} has not finished for any commit in the last 9 hours. Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
@@ -421,7 +421,7 @@
 # scheduling window is only four days.
 ALERT OverdueJobSpecNightly
   IF overdue_job_specs_s{job_trigger="nightly"}/60/60 > 28
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "borenet@google.com" }
   ANNOTATIONS {
     abbr = "{{ $labels.job_name }}",
     description = "{{ $labels.job_name }} has not completed in the last 28 hours (nightly job). Maybe the dimensions need changing? (Job defined here: {{ $labels.repo }}/+/master/infra/bots/tasks.json) Production Manual: https://skia.googlesource.com/buildbot/%2B/master/task_scheduler/PROD.md#overdue_job_spec"
@@ -487,7 +487,7 @@
 
 ALERT AlertToPubSub
   IF rate(pubsub_send_failure[5m]) > 0.003
-  LABELS { category = "infra", severity = "critical" }
+  LABELS { category = "infra", severity = "critical", owner = "jcgregorio@google.com" }
   ANNOTATIONS {
     abbr = "google.com:skia-buildbots",
     description = "Failed to send alert via PubSub. https://console.cloud.google.com/logs/viewer?project=google.com:skia-buildbots&minLogLevel=500&expandAll=false&resource=logging_log%2Fname%2F{{ reReplaceAll `:[0-9]+` `` $labels.instance }}&logName=projects%2Fgoogle.com:skia-buildbots%2Flogs%2F{{ $labels.job }}"
diff --git a/promk/prometheus/alerts_corp.yml b/promk/prometheus/alerts_corp.yml
index 5fc7644..75be6fb 100644
--- a/promk/prometheus/alerts_corp.yml
+++ b/promk/prometheus/alerts_corp.yml
@@ -8,6 +8,7 @@
     labels:
       category: infra
       severity: warning
+      owner: borenet@google.com
     annotations:
       description: 'The last DEPS roll into internal_test repo failed. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#autoroll_failed'
 
@@ -16,6 +17,7 @@
     labels:
       category: infra
       severity: warning
+      owner: borenet@google.com
     annotations:
       description: 'The last-landed roll into internal_test was over 24h ago. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#no_rolls_24h'
 
@@ -26,6 +28,7 @@
     labels:
       category: infra
       severity: warning
+      owner: rmistry@google.com
     annotations:
       abbr: 'Too many waiting android compile tasks'
       description: 'There are 5 or more waiting android compile tasks. https://skia.googlesource.com/buildbot/%2B/master/android_compile/PROD.md#queue_too_long'
@@ -36,6 +39,7 @@
     labels:
       category: infra
       severity: warning
+      owner: rmistry@google.com
     annotations:
       abbr: 'Mirror sync in android compile server failed'
       description: 'Mirror sync in android compile server failed. https://skia.googlesource.com/buildbot/%2B/master/android_compile/PROD.md#mirror_sync_failed'
@@ -46,6 +50,7 @@
     labels:
       category: infra
       severity: warning
+      owner: rmistry@google.com
     annotations:
       abbr: 'Android tree might be broken'
       description: 'Android tree might be broken. Both withpatch and nopatch runs failed on android compile server. https://skia.googlesource.com/buildbot/%2B/master/android_compile/PROD.md#android_tree_broken'
@@ -56,6 +61,7 @@
     labels:
       category: infra
       severity: critical
+      owner: rmistry@google.com
     annotations:
       abbr: 'Infra failure in android compile bot'
       description: 'Infra failure in android compile bot. https://skia.googlesource.com/buildbot/%2B/master/android_compile/PROD.md#infra_failure'
diff --git a/promk/prometheus/alerts_general.yml b/promk/prometheus/alerts_general.yml
index 4166cdc..abe056b 100644
--- a/promk/prometheus/alerts_general.yml
+++ b/promk/prometheus/alerts_general.yml
@@ -84,6 +84,7 @@
     labels:
       category: infra
       severity: critical
+      owner: borenet@google.com
     annotations:
       description: 'The error rate for autoroll on {{ $labels.app }} is too high.
       https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
@@ -94,6 +95,7 @@
     labels:
       category: infra
       severity: critical
+      owner: borenet@google.com
     annotations:
       description: 'Autoroll on {{ $labels.instance }} has failed to transition for more than 10 minutes.
       https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
@@ -121,6 +123,7 @@
     labels:
       category: infra
       severity: critical
+      owner: borenet@google.com
     annotations:
       description: 'Autoroll on {{ $labels.instance }} has failed to obtain the current sheriff for more than 2 hours. Please verify that the sheriff endpoint is working and that the rotation schedule is not empty.
       https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2F{{ $labels.project }}%2Flogs%2F{{ $labels.app }}
diff --git a/promk/prometheus/alerts_public.yml b/promk/prometheus/alerts_public.yml
index cdd6a28..e916e22 100644
--- a/promk/prometheus/alerts_public.yml
+++ b/promk/prometheus/alerts_public.yml
@@ -51,6 +51,7 @@
     labels:
       category: infra
       severity: warning
+      owner: jcgregorio@google.com
     annotations:
       abbr: 'Insufficient fiddler pods.'
       description: 'Fiddle is experiencing heavy load and has insufficient idle fiddler pods. https://skia.googlesource.com/buildbot/%2B/master/fiddlek/PROD.md#fiddler_pods'
@@ -61,6 +62,7 @@
     labels:
       category: infra
       severity: warning
+      owner: jcgregorio@google.com
     annotations:
       abbr: 'named fiddles failing'
       description: 'Some named fiddles are failing. Visit https://named-fiddles.skia.org to see which ones.'
@@ -71,6 +73,7 @@
     labels:
       category: infra
       severity: warning
+      owner: jcgregorio@google.com
     annotations:
       abbr: 'Fiddler pod communication errors.'
       description: 'Fiddle is having trouble communicating with fiddler pods.'
@@ -111,6 +114,7 @@
     labels:
       category: infra
       severity: critical
+      owner: kjlubick@google.com
     annotations:
       abbr: 'Full Upload Queue'
       description: 'Fuzzer upload queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_upload'
@@ -120,6 +124,7 @@
     labels:
       category: infra
       severity: warning
+      owner: kjlubick@google.com
     annotations:
       abbr: 'Fuzzer version stale'
       description: 'The Fuzzer hasnt rolled its version forward in 10 days.  Roll it forward on fuzzer.skia.org https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#stale_version'
@@ -129,6 +134,7 @@
     labels:
       category: infra
       severity: critical
+      owner: kjlubick@google.com
     annotations:
       abbr: 'Fuzzer roll taking a while'
       description: 'The fuzzer hasnt finished rolling its version forward in 2 hours.  Something might be wrong.  https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#broken_roll'
@@ -139,6 +145,7 @@
     labels:
       category: infra
       severity: critical
+      owner: kjlubick@google.com
     annotations:
       abbr: 'Full Analysis Queue'
       description: 'Fuzzer analysis queue has been very full on {{ $labels.host }}. https://skia.googlesource.com/buildbot/%2B/master/fuzzer/PROD.md#full_analysis'
@@ -159,6 +166,7 @@
     labels:
       category: infra
       severity: critical
+      owner: jcgregorio@google.com
     annotations:
       abbr: alert-to-pubsub
       description: 'alert-to-pubsub has failed to send a healthz PubSub event in 90s. https://skia.googlesource.com/buildbot/%2B/master/am/PROD.md#alert_to_pubsub'
@@ -171,6 +179,7 @@
     labels:
       category: infra
       severity: critical
+      owner: rmistry@google.com
     annotations:
       abbr: 'CT poller health check failed.'
       description: 'CT poller health check is failing. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ct_poller_health_check'
@@ -181,6 +190,7 @@
     labels:
       category: infra
       severity: critical
+      owner: rmistry@google.com
     annotations:
       abbr: 'CTFE pending task count too high.'
       description: 'There are a lot of CTFE pending tasks. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'
@@ -191,6 +201,7 @@
     labels:
       category: infra
       severity: critical
+      owner: rmistry@google.com
     annotations:
       abbr: 'CTFE pending task not running.'
       description: 'A task has been waiting to be executed for a while and it has still not started. https://skia.googlesource.com/buildbot/%2B/master/ct/PROD.md#ctfe_pending_tasks'
@@ -201,6 +212,7 @@
     labels:
       category: infra
       severity: critical
+      owner: borenet@google.com
     annotations:
       description: 'The endpoint for {{ $labels.probename }} {{ $labels.url }} took more than 200ms to respond. https://skia.googlesource.com/buildbot/%2B/master/autoroll/PROD.md#http_latency'
 
@@ -209,6 +221,7 @@
     labels:
       category: infra
       severity: critical
+      owner: borenet@google.com
     annotations:
       description: 'The error rate for autoroll on {{ $labels.instance }} is too high.
       https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
@@ -221,6 +234,7 @@
     labels:
       category: infra
       severity: critical
+      owner: rmistry@google.com
     annotations:
       description: 'Autoroll on {{ $labels.app }} has failed to transition for more than 30 minutes.
       https://console.cloud.google.com/logs/viewer?organizationId=433637338589&project={{ $labels.project }}&minLogLevel=500&resource=container&logName=projects%2Fskia-public%2Flogs%2F{{ $labels.app }}
@@ -232,6 +246,7 @@
     labels:
       category: infra
       severity: critical
+      owner: jcgregorio@google.com
     annotations:
       description: 'Error rate for processing buildids is too high. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#process_failures'
 
@@ -240,6 +255,7 @@
     labels:
       category: infra
       severity: critical
+      owner: jcgregorio@google.com
     annotations:
       description: 'android_ingest failing to record incoming data to transaction log. See https://github.com/google/skia-buildbot/blob/master/android_ingest/PROD.md#tx_log'