[machine] Schedule pods for restart if they get too old.
Change-Id: Ie9941a12d041dec92061bdcfa8c28bb10742adcf
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/302538
Auto-Submit: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Kevin Lubick <kjlubick@google.com>
Commit-Queue: Kevin Lubick <kjlubick@google.com>
diff --git a/machine/go/machine/processor/impl.go b/machine/go/machine/processor/impl.go
index 11f4596..0533b02 100644
--- a/machine/go/machine/processor/impl.go
+++ b/machine/go/machine/processor/impl.go
@@ -37,6 +37,11 @@
maxTemperatureC float64 = 35
batteryTemperatureKey = "dumpsys_battery"
+
+ maxPodLifetime = 24 * time.Hour
+
+ // The username for annotations made by the machine server.
+ machineUserName = "machines.skia.org"
)
var (
@@ -170,10 +175,19 @@
ret.Dimensions[k] = values
}
+ // If the pod gets too old we schedule it for deletion.
+ if time.Now().Sub(event.Host.StartTime) > maxPodLifetime && ret.ScheduledForDeletion == "" {
+ ret.ScheduledForDeletion = ret.PodName
+ ret.Annotation.Timestamp = time.Now()
+ ret.Annotation.Message = fmt.Sprintf("Pod too old, requested update for %q", ret.PodName)
+ ret.Annotation.User = machineUserName
+ }
+
// Once a pod has restarted it will have a new podname so clear the deletion.
if ret.ScheduledForDeletion != "" && ret.PodName != ret.ScheduledForDeletion {
ret.ScheduledForDeletion = ""
}
+
// If the machine was quarantined, but hasn't been quarantined this trip
// through Process then take the machine out of quarantine.
if previous.Mode == machine.ModeAvailable && len(previous.Dimensions[machine.DimQuarantined]) != 0 && len(dimensions[machine.DimQuarantined]) == 0 {
diff --git a/machine/go/machine/processor/impl_test.go b/machine/go/machine/processor/impl_test.go
index 3156367..09821e2 100644
--- a/machine/go/machine/processor/impl_test.go
+++ b/machine/go/machine/processor/impl_test.go
@@ -4,6 +4,7 @@
"context"
"strings"
"testing"
+ "time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -435,7 +436,50 @@
assert.Equal(t, "Battery is too low: 9 < 30 (%)", next.Dimensions[machine.DimQuarantined][0])
assert.Equal(t, 9, next.Battery)
assert.Equal(t, int64(9), metrics2.GetInt64Metric("machine_processor_device_battery_level", map[string]string{"machine": "skia-rpi2-0001"}).Get())
+}
+// TestProcess_ScheduleForDeletionIfPodIsTooOld checks that Process schedules a
+// pod for deletion, and attaches an annotation attributed to the machine
+// server, when the host's StartTime lies further in the past than
+// maxPodLifetime.
+func TestProcess_ScheduleForDeletionIfPodIsTooOld(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := context.Background()
+
+	previous := machine.NewDescription()
+	event := machine.Event{
+		EventType: machine.EventTypeRawState,
+		Host: machine.Host{
+			Name: "skia-rpi2-0001",
+			// Twice the allowed lifetime, so the pod is unambiguously too old.
+			StartTime: time.Now().Add(-2 * maxPodLifetime),
+			PodName:   "rpi-swarming-123",
+		},
+		Android: machine.Android{},
+	}
+
+	p := newProcessorForTest(t)
+	next := p.Process(ctx, previous, event)
+
+	// assert.Equal takes (t, expected, actual); keeping the reference value
+	// first makes a failure report label the two sides correctly.
+	assert.Equal(t, next.PodName, next.ScheduledForDeletion)
+	// Guard against the equality above passing trivially on two empty strings.
+	assert.NotEmpty(t, next.PodName)
+	assert.Equal(t, machineUserName, next.Annotation.User)
+	assert.Equal(t, "Pod too old, requested update for \"rpi-swarming-123\"", next.Annotation.Message)
+}
+
+// TestProcess_DoNoScheduleForDeletionIfPodIsntTooOld checks that Process leaves
+// ScheduledForDeletion empty when the host started only half of
+// maxPodLifetime ago.
+func TestProcess_DoNoScheduleForDeletionIfPodIsntTooOld(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := context.Background()
+
+	prior := machine.NewDescription()
+	// Half the allowed lifetime: comfortably inside the limit.
+	host := machine.Host{
+		Name:      "skia-rpi2-0001",
+		StartTime: time.Now().Add(-1 * maxPodLifetime / 2),
+		PodName:   "rpi-swarming-123",
+	}
+	rawState := machine.Event{
+		EventType: machine.EventTypeRawState,
+		Host:      host,
+		Android:   machine.Android{},
+	}
+
+	proc := newProcessorForTest(t)
+	got := proc.Process(ctx, prior, rawState)
+
+	assert.Empty(t, got.ScheduledForDeletion)
+	assert.NotEmpty(t, got.PodName)
}
func TestProcess_QuarantineIfDeviceTooHot(t *testing.T) {