Add alerts when bots don't run a job for 72 hours.
Bug: skia:
Change-Id: Ifd2b987e0a5a4ac7b27f6d99ea4abbfc87d7d6f3
Reviewed-on: https://skia-review.googlesource.com/36400
Reviewed-by: Eric Boren <borenet@google.com>
Commit-Queue: Kevin Lubick <kjlubick@google.com>
diff --git a/datahopper/go/swarming_metrics/bots.go b/datahopper/go/swarming_metrics/bots.go
index b5d9c71..6826f60 100644
--- a/datahopper/go/swarming_metrics/bots.go
+++ b/datahopper/go/swarming_metrics/bots.go
@@ -12,6 +12,7 @@
const (
MEASUREMENT_SWARM_BOTS_LAST_SEEN = "swarming_bots_last_seen"
MEASUREMENT_SWARM_BOTS_QUARANTINED = "swarming_bots_quarantined"
+ MEASUREMENT_SWARM_BOTS_LAST_TASK = "swarming_bots_last_task"
)
// cleanupOldMetrics deletes old metrics, replace with new ones. This fixes the case where
@@ -42,7 +43,7 @@
for _, bot := range bots {
last, err := time.Parse("2006-01-02T15:04:05", bot.LastSeenTs)
if err != nil {
- sklog.Errorf("Malformed time in bot: %s", err)
+ sklog.Errorf("Malformed last seen time in bot: %s", err)
continue
}
@@ -65,6 +66,28 @@
m2 := metricsClient.GetInt64Metric(MEASUREMENT_SWARM_BOTS_QUARANTINED, tags)
m2.Update(quarantined)
newMetrics = append(newMetrics, m2)
+
+ // Last task performed <duration> ago
+ lastTasks, err := client.ListBotTasks(bot.BotId, 1)
+ if err != nil {
+ sklog.Errorf("Problem getting tasks that bot %s has run: %s", bot.BotId, err)
+ continue
+ }
+ ts := ""
+ if len(lastTasks) == 0 {
+ ts = bot.FirstSeenTs
+ } else {
+ ts = lastTasks[0].ModifiedTs
+ }
+
+ last, err = time.Parse("2006-01-02T15:04:05", ts)
+ if err != nil {
+ sklog.Errorf("Malformed last modified time in bot %s's last task %q: %s", bot.BotId, ts, err)
+ continue
+ }
+ m3 := metricsClient.GetInt64Metric(MEASUREMENT_SWARM_BOTS_LAST_TASK, tags)
+ m3.Update(int64(now.Sub(last)))
+ newMetrics = append(newMetrics, m1)
}
return newMetrics, nil
}
diff --git a/datahopper/go/swarming_metrics/bots_test.go b/datahopper/go/swarming_metrics/bots_test.go
index 82e498d..f1e2c17 100644
--- a/datahopper/go/swarming_metrics/bots_test.go
+++ b/datahopper/go/swarming_metrics/bots_test.go
@@ -12,10 +12,16 @@
"go.skia.org/infra/go/testutils"
swarming_api "github.com/luci/luci-go/common/api/swarming/swarming/v1"
+ "github.com/stretchr/testify/mock"
assert "github.com/stretchr/testify/require"
metrics_util "go.skia.org/infra/go/metrics2/testutils"
)
+const (
+ MOCK_POOL = "SomePool"
+ MOCK_SERVER = "SomeServer"
+)
+
type expectations struct {
botID string
quarantined bool
@@ -66,22 +72,24 @@
LastSeenTs: now.Add(-e.lastSeenDelta).Format("2006-01-02T15:04:05"),
IsDead: e.isDead,
Quarantined: e.quarantined,
+ FirstSeenTs: now.Add(-24 * time.Hour).Format("2006-01-02T15:04:05"),
})
}
- ms.On("ListBotsForPool", "SomePool").Return(b, nil)
+ ms.On("ListBotsForPool", MOCK_POOL).Return(b, nil)
+ ms.On("ListBotTasks", mock.AnythingOfType("string"), 1).Return([]*swarming_api.SwarmingRpcsTaskResult{}, nil)
pc := getPromClient()
- newMetrics, err := reportBotMetrics(now, ms, pc, "SomePool", "SomeServer")
+ newMetrics, err := reportBotMetrics(now, ms, pc, MOCK_POOL, MOCK_SERVER)
assert.NoError(t, err)
- assert.Len(t, newMetrics, 6, "3 bots * 2 metrics each = 6 expected metrics")
+ assert.Len(t, newMetrics, 9, "3 bots * 3 metrics each = 9 expected metrics")
for _, e := range ex {
tags := map[string]string{
"bot": e.botID,
- "pool": "SomePool",
- "swarming": "SomeServer",
+ "pool": MOCK_POOL,
+ "swarming": MOCK_SERVER,
}
// even though this is a (really big) int, JSON notation returns scientific notation
// for large enough ints, which means we need to ParseFloat, the only parser we have
@@ -90,7 +98,7 @@
assert.NoError(t, err)
assert.Equalf(t, int64(e.lastSeenDelta), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_LAST_SEEN)
- actual, err = strconv.ParseFloat(metrics_util.GetRecordedMetric(t, MEASUREMENT_SWARM_BOTS_QUARANTINED, tags), 64)
+ actual, err = strconv.ParseFloat(metrics_util.GetRecordedMetric(t, "swarming_bots_quarantined", tags), 64)
assert.NoError(t, err)
expected := 0
if e.quarantined {
@@ -99,3 +107,46 @@
assert.Equalf(t, int64(expected), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_QUARANTINED)
}
}
+
+func TestLastTaskBotMetrics(t *testing.T) {
+ testutils.SmallTest(t)
+
+ ms := swarming.NewMockApiClient()
+ defer ms.AssertExpectations(t)
+
+ now := time.Date(2017, 9, 1, 12, 0, 0, 0, time.UTC)
+
+ ms.On("ListBotsForPool", MOCK_POOL).Return([]*swarming_api.SwarmingRpcsBotInfo{
+ {
+ BotId: "my-bot",
+ LastSeenTs: now.Add(-time.Minute).Format("2006-01-02T15:04:05"),
+ IsDead: false,
+ Quarantined: false,
+ },
+ }, nil)
+
+ ms.On("ListBotTasks", "my-bot", 1).Return([]*swarming_api.SwarmingRpcsTaskResult{
+ {
+ ModifiedTs: now.Add(-31 * time.Minute).Format("2006-01-02T15:04:05"),
+ },
+ }, nil)
+
+ pc := getPromClient()
+
+ newMetrics, err := reportBotMetrics(now, ms, pc, MOCK_POOL, MOCK_SERVER)
+ assert.NoError(t, err)
+ assert.Len(t, newMetrics, 3, "1 bot * 3 metrics = 3 expected metrics")
+
+ tags := map[string]string{
+ "bot": "my-bot",
+ "pool": MOCK_POOL,
+ "swarming": MOCK_SERVER,
+ }
+ // even though this is a (really big) int, JSON notation returns scientific notation
+ // for large enough ints, which means we need to ParseFloat, the only parser we have
+ // that can read Scientific notation.
+ actual, err := strconv.ParseFloat(metrics_util.GetRecordedMetric(t, MEASUREMENT_SWARM_BOTS_LAST_TASK, tags), 64)
+ assert.NoError(t, err)
+ assert.Equalf(t, int64(31*time.Minute), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_LAST_TASK)
+
+}
diff --git a/go/swarming/apiclient.go b/go/swarming/apiclient.go
index 803f049..a2f6b65 100644
--- a/go/swarming/apiclient.go
+++ b/go/swarming/apiclient.go
@@ -83,6 +83,10 @@
GracefullyShutdownBot(id string) (*swarming.SwarmingRpcsTerminateResponse, error)
+ // ListBotTasks returns a slice of SwarmingRpcsTaskResult that are the last
+ // N tasks done by a bot. When limit is big (>100), this call is very expensive.
+ ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error)
+
// ListTasks returns a slice of swarming.SwarmingRpcsTaskRequestMetadata
// instances corresponding to the specified tags and within given time window.
// The results will have TaskId, TaskResult, and Request fields populated.
@@ -210,6 +214,25 @@
return c.s.Bot.Terminate(id).Do()
}
+type limitOption struct {
+ limit int
+}
+
+func (l *limitOption) Get() (string, string) {
+ return "limit", strconv.Itoa(l.limit)
+}
+
+func (c *apiClient) ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error) {
+ // The paramaters for Do() are a list of things that implement the Get() method
+ // which returns a key and a value. This gets turned into key=value on the url
+ // request, which works, even though Limit is not part of the client library.
+ res, err := c.s.Bot.Tasks(botID).Do(&limitOption{limit: 1})
+ if err != nil {
+ return nil, err
+ }
+ return res.Items, nil
+}
+
func (c *apiClient) ListSkiaTasks(start, end time.Time) ([]*swarming.SwarmingRpcsTaskRequestMetadata, error) {
return c.ListTasks(start, end, []string{"pool:Skia"}, "")
}
diff --git a/go/swarming/mock_apiclient.go b/go/swarming/mock_apiclient.go
index f57c041..0b0c578 100644
--- a/go/swarming/mock_apiclient.go
+++ b/go/swarming/mock_apiclient.go
@@ -262,6 +262,22 @@
return r0, r1
}
+// ListBotTasks provides a mock function with given fields: botID, limit
+func (_m *MockApiClient) ListBotTasks(botID string, limit int) ([]*v1.SwarmingRpcsTaskResult, error) {
+ ret := _m.Called(botID, limit)
+
+ var r0 []*v1.SwarmingRpcsTaskResult
+ if rf, ok := ret.Get(0).(func(string, int) []*v1.SwarmingRpcsTaskResult); ok {
+ r0 = rf(botID, limit)
+ } else {
+ if ret.Get(0) != nil {
+ r0 = ret.Get(0).([]*v1.SwarmingRpcsTaskResult)
+ }
+ }
+
+ return r0, ret.Error(1)
+}
+
// ListTasks provides a mock function with given fields: start, end, tags, state
func (_m *MockApiClient) ListTasks(start time.Time, end time.Time, tags []string, state string) ([]*v1.SwarmingRpcsTaskRequestMetadata, error) {
ret := _m.Called(start, end, tags, state)
diff --git a/go/swarming/testutils.go b/go/swarming/testutils.go
index cafef8d..67b774c 100644
--- a/go/swarming/testutils.go
+++ b/go/swarming/testutils.go
@@ -122,6 +122,17 @@
return rv, nil
}
+func (c *TestClient) ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error) {
+ // For now, just return all tasks in the list. This could probably be better.
+ c.taskListMtx.RLock()
+ defer c.taskListMtx.RUnlock()
+ rv := make([]*swarming.SwarmingRpcsTaskResult, 0, len(c.taskList))
+ for _, t := range c.taskList {
+ rv = append(rv, t.TaskResult)
+ }
+ return rv, nil
+}
+
func (c *TestClient) ListSkiaTasks(start, end time.Time) ([]*swarming.SwarmingRpcsTaskRequestMetadata, error) {
return c.ListTasks(start, end, []string{"pool:Skia"}, "")
}
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index d90e788..a2032b7 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules
@@ -256,7 +256,7 @@
# Swarming
ALERT BotMissing
- IF swarming_bots_last_seen{bot!~"ct-gce-.*"}/1024/1024/1024/60 > 15
+ IF swarming_bots_last_seen{bot!~"ct-gce-.*"}/1000/1000/1000/60 > 15
LABELS { category = "infra", severity = "critical"}
ANNOTATIONS {
abbr = "{{ $labels.bot }}",
@@ -270,6 +270,14 @@
description = "1 or more CT GCE bots are down: https://chrome-swarming.appspot.com/botlist?f=status%3Adead&f=gpu%3Anone&f=pool%3ACT&l=100"
}
+ALERT BotUnemployed
+ IF swarming_bots_last_task{pool=~"Skia.*"}/1000/1000/1000/60/60 >= 72
+ LABELS { category = "infra", severity = "critical"}
+ ANNOTATIONS {
+ abbr = "{{ $labels.bot }}",
+ description = "Swarming bot {{ $labels.bot }} hasn't run a job in 72 hours. Maybe it's dimensions need changing? https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance"
+ }
+
ALERT BotQuarantined
IF avg_over_time(swarming_bots_quarantined[10m]) >= 1
LABELS { category = "infra", severity = "critical"}
@@ -277,7 +285,6 @@
abbr = "{{ $labels.bot }}",
description = "Swarming bot {{ $labels.bot }} is quarantined. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance"
}
-
# Swarming Logger
ALERT SwarmingLoggerErrorRate