[machine] Schedule pods for restart if they get too old.

Change-Id: Ie9941a12d041dec92061bdcfa8c28bb10742adcf
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/302538
Auto-Submit: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Commit-Queue: Kevin Lubick <kjlubick@google.com>
diff --git a/machine/go/machine/processor/impl.go b/machine/go/machine/processor/impl.go
index 11f4596..0533b02 100644
--- a/machine/go/machine/processor/impl.go
+++ b/machine/go/machine/processor/impl.go
@@ -37,6 +37,11 @@
 	maxTemperatureC float64 = 35
 
 	batteryTemperatureKey = "dumpsys_battery"
+
+	maxPodLifetime = 24 * time.Hour
+
+	// The username for annotations made by the machine server.
+	machineUserName = "machines.skia.org"
 )
 
 var (
@@ -170,10 +175,19 @@
 		ret.Dimensions[k] = values
 	}
 
+	// If the pod gets too old we schedule it for deletion.
+	if time.Now().Sub(event.Host.StartTime) > maxPodLifetime && ret.ScheduledForDeletion == "" {
+		ret.ScheduledForDeletion = ret.PodName
+		ret.Annotation.Timestamp = time.Now()
+		ret.Annotation.Message = fmt.Sprintf("Pod too old, requested update for %q", ret.PodName)
+		ret.Annotation.User = machineUserName
+	}
+
 	// Once a pod has restarted it will have a new podname so clear the deletion.
 	if ret.ScheduledForDeletion != "" && ret.PodName != ret.ScheduledForDeletion {
 		ret.ScheduledForDeletion = ""
 	}
+
 	// If the machine was quarantined, but hasn't been quarantined this trip
 	// through Process then take the machine out of quarantine.
 	if previous.Mode == machine.ModeAvailable && len(previous.Dimensions[machine.DimQuarantined]) != 0 && len(dimensions[machine.DimQuarantined]) == 0 {
diff --git a/machine/go/machine/processor/impl_test.go b/machine/go/machine/processor/impl_test.go
index 3156367..09821e2 100644
--- a/machine/go/machine/processor/impl_test.go
+++ b/machine/go/machine/processor/impl_test.go
@@ -4,6 +4,7 @@
 	"context"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -435,7 +436,50 @@
 	assert.Equal(t, "Battery is too low: 9 < 30 (%)", next.Dimensions[machine.DimQuarantined][0])
 	assert.Equal(t, 9, next.Battery)
 	assert.Equal(t, int64(9), metrics2.GetInt64Metric("machine_processor_device_battery_level", map[string]string{"machine": "skia-rpi2-0001"}).Get())
+}
 
+// TestProcess_ScheduleForDeletionIfPodIsTooOld verifies that a pod which started
+// more than maxPodLifetime ago is scheduled for deletion and annotated.
+func TestProcess_ScheduleForDeletionIfPodIsTooOld(t *testing.T) {
+	unittest.SmallTest(t)
+	previous := machine.NewDescription()
+	event := machine.Event{
+		EventType: machine.EventTypeRawState,
+		Host: machine.Host{
+			Name:      "skia-rpi2-0001",
+			StartTime: time.Now().Add(-2 * maxPodLifetime),
+			PodName:   "rpi-swarming-123",
+		},
+		Android: machine.Android{},
+	}
+	// Expected values come first so testify labels any failure correctly.
+	p := newProcessorForTest(t)
+	next := p.Process(context.Background(), previous, event)
+	assert.Equal(t, "rpi-swarming-123", next.PodName)
+	assert.Equal(t, next.PodName, next.ScheduledForDeletion)
+	assert.Equal(t, machineUserName, next.Annotation.User)
+	assert.Equal(t, "Pod too old, requested update for \"rpi-swarming-123\"", next.Annotation.Message)
+}
+
+// TestProcess_DoNoScheduleForDeletionIfPodIsntTooOld confirms that a pod which has
+// run for only half of maxPodLifetime is not scheduled for deletion.
+func TestProcess_DoNoScheduleForDeletionIfPodIsntTooOld(t *testing.T) {
+	unittest.SmallTest(t)
+	prior := machine.NewDescription()
+	host := machine.Host{
+		Name:      "skia-rpi2-0001",
+		StartTime: time.Now().Add(-maxPodLifetime / 2),
+		PodName:   "rpi-swarming-123",
+	}
+	rawState := machine.Event{
+		EventType: machine.EventTypeRawState,
+		Host:      host,
+		Android:   machine.Android{},
+	}
+	processor := newProcessorForTest(t)
+	got := processor.Process(context.Background(), prior, rawState)
+	assert.Empty(t, got.ScheduledForDeletion)
+	assert.NotEmpty(t, got.PodName)
 }
 
 func TestProcess_QuarantineIfDeviceTooHot(t *testing.T) {