Add alerts when bots don't run a job for 72 hours.

Bug: skia:
Change-Id: Ifd2b987e0a5a4ac7b27f6d99ea4abbfc87d7d6f3
Reviewed-on: https://skia-review.googlesource.com/36400
Reviewed-by: Eric Boren <borenet@google.com>
Commit-Queue: Kevin Lubick <kjlubick@google.com>
diff --git a/datahopper/go/swarming_metrics/bots.go b/datahopper/go/swarming_metrics/bots.go
index b5d9c71..6826f60 100644
--- a/datahopper/go/swarming_metrics/bots.go
+++ b/datahopper/go/swarming_metrics/bots.go
@@ -12,6 +12,7 @@
 const (
 	MEASUREMENT_SWARM_BOTS_LAST_SEEN   = "swarming_bots_last_seen"
 	MEASUREMENT_SWARM_BOTS_QUARANTINED = "swarming_bots_quarantined"
+	MEASUREMENT_SWARM_BOTS_LAST_TASK   = "swarming_bots_last_task"
 )
 
 // cleanupOldMetrics deletes old metrics, replace with new ones. This fixes the case where
@@ -42,7 +43,7 @@
 	for _, bot := range bots {
 		last, err := time.Parse("2006-01-02T15:04:05", bot.LastSeenTs)
 		if err != nil {
-			sklog.Errorf("Malformed time in bot: %s", err)
+			sklog.Errorf("Malformed last seen time in bot: %s", err)
 			continue
 		}
 
@@ -65,6 +66,28 @@
 		m2 := metricsClient.GetInt64Metric(MEASUREMENT_SWARM_BOTS_QUARANTINED, tags)
 		m2.Update(quarantined)
 		newMetrics = append(newMetrics, m2)
+
+		// Last task performed <duration> ago
+		lastTasks, err := client.ListBotTasks(bot.BotId, 1)
+		if err != nil {
+			sklog.Errorf("Problem getting tasks that bot %s has run: %s", bot.BotId, err)
+			continue
+		}
+		ts := ""
+		if len(lastTasks) == 0 {
+			ts = bot.FirstSeenTs
+		} else {
+			ts = lastTasks[0].ModifiedTs
+		}
+
+		last, err = time.Parse("2006-01-02T15:04:05", ts)
+		if err != nil {
+			sklog.Errorf("Malformed last modified time in bot %s's last task %q: %s", bot.BotId, ts, err)
+			continue
+		}
+		m3 := metricsClient.GetInt64Metric(MEASUREMENT_SWARM_BOTS_LAST_TASK, tags)
+		m3.Update(int64(now.Sub(last)))
+		newMetrics = append(newMetrics, m1)
 	}
 	return newMetrics, nil
 }
diff --git a/datahopper/go/swarming_metrics/bots_test.go b/datahopper/go/swarming_metrics/bots_test.go
index 82e498d..f1e2c17 100644
--- a/datahopper/go/swarming_metrics/bots_test.go
+++ b/datahopper/go/swarming_metrics/bots_test.go
@@ -12,10 +12,16 @@
 	"go.skia.org/infra/go/testutils"
 
 	swarming_api "github.com/luci/luci-go/common/api/swarming/swarming/v1"
+	"github.com/stretchr/testify/mock"
 	assert "github.com/stretchr/testify/require"
 	metrics_util "go.skia.org/infra/go/metrics2/testutils"
 )
 
+const (
+	MOCK_POOL   = "SomePool"
+	MOCK_SERVER = "SomeServer"
+)
+
 type expectations struct {
 	botID         string
 	quarantined   bool
@@ -66,22 +72,24 @@
 			LastSeenTs:  now.Add(-e.lastSeenDelta).Format("2006-01-02T15:04:05"),
 			IsDead:      e.isDead,
 			Quarantined: e.quarantined,
+			FirstSeenTs: now.Add(-24 * time.Hour).Format("2006-01-02T15:04:05"),
 		})
 	}
 
-	ms.On("ListBotsForPool", "SomePool").Return(b, nil)
+	ms.On("ListBotsForPool", MOCK_POOL).Return(b, nil)
+	ms.On("ListBotTasks", mock.AnythingOfType("string"), 1).Return([]*swarming_api.SwarmingRpcsTaskResult{}, nil)
 
 	pc := getPromClient()
 
-	newMetrics, err := reportBotMetrics(now, ms, pc, "SomePool", "SomeServer")
+	newMetrics, err := reportBotMetrics(now, ms, pc, MOCK_POOL, MOCK_SERVER)
 	assert.NoError(t, err)
-	assert.Len(t, newMetrics, 6, "3 bots * 2 metrics each = 6 expected metrics")
+	assert.Len(t, newMetrics, 9, "3 bots * 3 metrics each = 9 expected metrics")
 
 	for _, e := range ex {
 		tags := map[string]string{
 			"bot":      e.botID,
-			"pool":     "SomePool",
-			"swarming": "SomeServer",
+			"pool":     MOCK_POOL,
+			"swarming": MOCK_SERVER,
 		}
 		// even though this is a (really big) int, JSON notation returns scientific notation
 		// for large enough ints, which means we need to ParseFloat, the only parser we have
@@ -90,7 +98,7 @@
 		assert.NoError(t, err)
 		assert.Equalf(t, int64(e.lastSeenDelta), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_LAST_SEEN)
 
-		actual, err = strconv.ParseFloat(metrics_util.GetRecordedMetric(t, MEASUREMENT_SWARM_BOTS_QUARANTINED, tags), 64)
+		actual, err = strconv.ParseFloat(metrics_util.GetRecordedMetric(t, "swarming_bots_quarantined", tags), 64)
 		assert.NoError(t, err)
 		expected := 0
 		if e.quarantined {
@@ -99,3 +107,46 @@
 		assert.Equalf(t, int64(expected), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_QUARANTINED)
 	}
 }
+
+func TestLastTaskBotMetrics(t *testing.T) {
+	testutils.SmallTest(t)
+
+	ms := swarming.NewMockApiClient()
+	defer ms.AssertExpectations(t)
+
+	now := time.Date(2017, 9, 1, 12, 0, 0, 0, time.UTC)
+
+	ms.On("ListBotsForPool", MOCK_POOL).Return([]*swarming_api.SwarmingRpcsBotInfo{
+		{
+			BotId:       "my-bot",
+			LastSeenTs:  now.Add(-time.Minute).Format("2006-01-02T15:04:05"),
+			IsDead:      false,
+			Quarantined: false,
+		},
+	}, nil)
+
+	ms.On("ListBotTasks", "my-bot", 1).Return([]*swarming_api.SwarmingRpcsTaskResult{
+		{
+			ModifiedTs: now.Add(-31 * time.Minute).Format("2006-01-02T15:04:05"),
+		},
+	}, nil)
+
+	pc := getPromClient()
+
+	newMetrics, err := reportBotMetrics(now, ms, pc, MOCK_POOL, MOCK_SERVER)
+	assert.NoError(t, err)
+	assert.Len(t, newMetrics, 3, "1 bot * 3 metrics = 3 expected metrics")
+
+	tags := map[string]string{
+		"bot":      "my-bot",
+		"pool":     MOCK_POOL,
+		"swarming": MOCK_SERVER,
+	}
+	// even though this is a (really big) int, JSON notation returns scientific notation
+	// for large enough ints, which means we need to ParseFloat, the only parser we have
+	// that can read Scientific notation.
+	actual, err := strconv.ParseFloat(metrics_util.GetRecordedMetric(t, MEASUREMENT_SWARM_BOTS_LAST_TASK, tags), 64)
+	assert.NoError(t, err)
+	assert.Equalf(t, int64(31*time.Minute), int64(actual), "Wrong last seen time for metric %s", MEASUREMENT_SWARM_BOTS_LAST_TASK)
+
+}
diff --git a/go/swarming/apiclient.go b/go/swarming/apiclient.go
index 803f049..a2f6b65 100644
--- a/go/swarming/apiclient.go
+++ b/go/swarming/apiclient.go
@@ -83,6 +83,10 @@
 
 	GracefullyShutdownBot(id string) (*swarming.SwarmingRpcsTerminateResponse, error)
 
+	// ListBotTasks returns a slice of SwarmingRpcsTaskResult that are the last
+	// N tasks done by a bot. When limit is big (>100), this call is very expensive.
+	ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error)
+
 	// ListTasks returns a slice of swarming.SwarmingRpcsTaskRequestMetadata
 	// instances corresponding to the specified tags and within given time window.
 	// The results will have TaskId, TaskResult, and Request fields populated.
@@ -210,6 +214,25 @@
 	return c.s.Bot.Terminate(id).Do()
 }
 
+type limitOption struct {
+	limit int
+}
+
+func (l *limitOption) Get() (string, string) {
+	return "limit", strconv.Itoa(l.limit)
+}
+
+func (c *apiClient) ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error) {
+	// The paramaters for Do() are a list of things that implement the Get() method
+	// which returns a key and a value. This gets turned into key=value on the url
+	// request, which works, even though Limit is not part of the client library.
+	res, err := c.s.Bot.Tasks(botID).Do(&limitOption{limit: 1})
+	if err != nil {
+		return nil, err
+	}
+	return res.Items, nil
+}
+
 func (c *apiClient) ListSkiaTasks(start, end time.Time) ([]*swarming.SwarmingRpcsTaskRequestMetadata, error) {
 	return c.ListTasks(start, end, []string{"pool:Skia"}, "")
 }
diff --git a/go/swarming/mock_apiclient.go b/go/swarming/mock_apiclient.go
index f57c041..0b0c578 100644
--- a/go/swarming/mock_apiclient.go
+++ b/go/swarming/mock_apiclient.go
@@ -262,6 +262,22 @@
 	return r0, r1
 }
 
+// ListBotTasks provides a mock function with given fields: botID, limit
+func (_m *MockApiClient) ListBotTasks(botID string, limit int) ([]*v1.SwarmingRpcsTaskResult, error) {
+	ret := _m.Called(botID, limit)
+
+	var r0 []*v1.SwarmingRpcsTaskResult
+	if rf, ok := ret.Get(0).(func(string, int) []*v1.SwarmingRpcsTaskResult); ok {
+		r0 = rf(botID, limit)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).([]*v1.SwarmingRpcsTaskResult)
+		}
+	}
+
+	return r0, ret.Error(1)
+}
+
 // ListTasks provides a mock function with given fields: start, end, tags, state
 func (_m *MockApiClient) ListTasks(start time.Time, end time.Time, tags []string, state string) ([]*v1.SwarmingRpcsTaskRequestMetadata, error) {
 	ret := _m.Called(start, end, tags, state)
diff --git a/go/swarming/testutils.go b/go/swarming/testutils.go
index cafef8d..67b774c 100644
--- a/go/swarming/testutils.go
+++ b/go/swarming/testutils.go
@@ -122,6 +122,17 @@
 	return rv, nil
 }
 
+func (c *TestClient) ListBotTasks(botID string, limit int) ([]*swarming.SwarmingRpcsTaskResult, error) {
+	// For now, just return all tasks in the list.  This could probably be better.
+	c.taskListMtx.RLock()
+	defer c.taskListMtx.RUnlock()
+	rv := make([]*swarming.SwarmingRpcsTaskResult, 0, len(c.taskList))
+	for _, t := range c.taskList {
+		rv = append(rv, t.TaskResult)
+	}
+	return rv, nil
+}
+
 func (c *TestClient) ListSkiaTasks(start, end time.Time) ([]*swarming.SwarmingRpcsTaskRequestMetadata, error) {
 	return c.ListTasks(start, end, []string{"pool:Skia"}, "")
 }
diff --git a/prometheus/sys/alert.rules b/prometheus/sys/alert.rules
index d90e788..a2032b7 100644
--- a/prometheus/sys/alert.rules
+++ b/prometheus/sys/alert.rules
@@ -256,7 +256,7 @@
 # Swarming
 
 ALERT BotMissing
-  IF swarming_bots_last_seen{bot!~"ct-gce-.*"}/1024/1024/1024/60 > 15
+  IF swarming_bots_last_seen{bot!~"ct-gce-.*"}/1000/1000/1000/60 > 15
   LABELS { category = "infra", severity = "critical"}
   ANNOTATIONS {
     abbr = "{{ $labels.bot }}",
@@ -270,6 +270,14 @@
     description = "1 or more CT GCE bots are down: https://chrome-swarming.appspot.com/botlist?f=status%3Adead&f=gpu%3Anone&f=pool%3ACT&l=100"
   }
 
+ALERT BotUnemployed
+  IF swarming_bots_last_task{pool=~"Skia.*"}/1000/1000/1000/60/60 >= 72
+  LABELS { category = "infra", severity = "critical"}
+  ANNOTATIONS {
+    abbr = "{{ $labels.bot }}",
+    description = "Swarming bot {{ $labels.bot }} hasn't run a job in 72 hours. Maybe it's dimensions need changing? https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance"
+  }
+
 ALERT BotQuarantined
   IF avg_over_time(swarming_bots_quarantined[10m]) >= 1
   LABELS { category = "infra", severity = "critical"}
@@ -277,7 +285,6 @@
     abbr = "{{ $labels.bot }}",
     description = "Swarming bot {{ $labels.bot }} is quarantined. https://{{ $labels.swarming }}/bot?id={{ $labels.bot }} https://goto.google.com/skolo-maintenance"
   }
-
 # Swarming Logger
 
 ALERT SwarmingLoggerErrorRate