[machines] adb reconnect if an Android device goes offline.

Android device can go 'offline', but running 'adb reconnect offline' may
reconnect them, so add that as a new step to adb.go.

The added EnsureOnline() checks the status of the connection and tries
to reconnect if the device is offline, otherwise it reports the state
of the device.

Change-Id: Ide7e9dd92951f313c9898f6dec3b4d5ca1206347
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/448466
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
Reviewed-by: Kevin Lubick <kjlubick@google.com>
diff --git a/machine/go/test_machine_monitor/adb/adb.go b/machine/go/test_machine_monitor/adb/adb.go
index e471caf..1b3cba8 100644
--- a/machine/go/test_machine_monitor/adb/adb.go
+++ b/machine/go/test_machine_monitor/adb/adb.go
@@ -26,6 +26,11 @@
 
 // Adb is the interface that AdbImpl provides.
 type Adb interface {
+	// EnsureOnline returns nil if the Android device is online and ready to
+	// run, otherwise it will try to bring it back online and if that fails will
+	// return an error.
+	EnsureOnline(ctx context.Context) error
+
 	// RawProperties returns the unfiltered output of running "adb shell getprop".
 	RawProperties(ctx context.Context) (string, error)
 
@@ -39,71 +44,109 @@
 	Uptime(ctx context.Context) (time.Duration, error)
 }
 
-// RawProperties implements the Adb interface.
-func (a AdbImpl) RawProperties(ctx context.Context) (string, error) {
+// adbCommand sends a single adb command to the device then captures and returns
+// both the stdout and stderr of the results.
+func (a AdbImpl) adbCommand(ctx context.Context, args ...string) (string, string, error) {
 	ctx, cancel := context.WithTimeout(ctx, commandTimeout)
 	defer cancel()
-	cmd := executil.CommandContext(ctx, "adb", "shell", "getprop")
+	cmd := executil.CommandContext(ctx, "adb", args...)
 
 	b, err := cmd.Output()
 	if err != nil {
+		stderr := ""
 		if ee, ok := err.(*exec.ExitError); ok {
-			err = skerr.Wrapf(err, "adb failed with stderr: %q", ee.Stderr)
+			stderr = string(ee.Stderr)
+			err = skerr.Wrapf(err, "adb %s: failed with stderr: %q", strings.Join(args, " "), ee.Stderr)
 		}
+		return "", stderr, skerr.Wrap(err)
+	}
+	return string(b), "", nil
+}
+
+// getState returns an error if the device is in an error state or it can't be
+// queried.
+//
+// The returned string will contain the text of error: message, excluding the
+// 'error: ' part if the error was from an exit code, otherwise the empty string
+// is returned.
+func (a AdbImpl) getState(ctx context.Context) (string, error) {
+	_, stderr, err := a.adbCommand(ctx, "get-state")
+	if err != nil {
+		return strings.TrimPrefix(stderr, "error:"), skerr.Wrap(err)
+	}
+	return "", nil
+}
+
+// EnsureOnline implements the Adb interface.
+func (a AdbImpl) EnsureOnline(ctx context.Context) error {
+	// Run `adb get-state` it will return a non-zero exit code and emit a string of the form:
+	//
+	//     error: device offline
+	//     error: device unauthorized
+	//     error: no devices/emulators found
+	//
+	// If the device is connected and ready to go it will return a 0 exit code and print:
+	//
+	//     device
+	//
+	// If offline we should run:
+	//
+	//     adb reconnect offline
+	//
+	// Which should reconnect the device.
+	state, err := a.getState(ctx)
+	if err == nil {
+		return nil
+	}
+	if !strings.Contains(state, "offline") {
+		return skerr.Wrapf(err, "adb returned an error state we can't do anything about: %q", state)
+	}
+
+	_, _, err = a.adbCommand(ctx, "reconnect", "offline")
+	if err != nil {
+		return skerr.Wrap(err)
+	}
+
+	_, err = a.getState(ctx)
+
+	return skerr.Wrap(err)
+}
+
+// RawProperties implements the Adb interface.
+func (a AdbImpl) RawProperties(ctx context.Context) (string, error) {
+	stdout, _, err := a.adbCommand(ctx, "shell", "getprop")
+	if err != nil {
 		return "", err
 	}
-	return string(b), nil
+	return stdout, nil
 }
 
 // RawDumpSys implements the Adb interface.
 func (a AdbImpl) RawDumpSys(ctx context.Context, service string) (string, error) {
-	ctx, cancel := context.WithTimeout(ctx, commandTimeout)
-	defer cancel()
-	cmd := executil.CommandContext(ctx, "adb", "shell", "dumpsys", service)
-
-	b, err := cmd.Output()
+	stdout, _, err := a.adbCommand(ctx, "shell", "dumpsys", service)
 	if err != nil {
-		if ee, ok := err.(*exec.ExitError); ok {
-			err = skerr.Wrapf(err, "adb failed with stderr: %q", ee.Stderr)
-		}
 		return "", err
 	}
-	return string(b), nil
+	return stdout, nil
 }
 
 // Reboot implements the Adb interface.
 func (a AdbImpl) Reboot(ctx context.Context) error {
-	ctx, cancel := context.WithTimeout(ctx, commandTimeout)
-	defer cancel()
-	cmd := executil.CommandContext(ctx, "adb", "reboot")
-
-	_, err := cmd.Output()
-	if err != nil {
-		if ee, ok := err.(*exec.ExitError); ok {
-			err = skerr.Wrapf(err, "adb reboot with stderr: %q", ee.Stderr)
-		}
-		return err
-	}
-	return nil
+	_, _, err := a.adbCommand(ctx, "reboot")
+	return err
 }
 
 // Uptime implements the Adb interface.
 func (a AdbImpl) Uptime(ctx context.Context) (time.Duration, error) {
-	ctx, cancel := context.WithTimeout(ctx, commandTimeout)
-	defer cancel()
-	cmd := executil.CommandContext(ctx, "adb", "shell", "cat", "/proc/uptime")
-
-	b, err := cmd.Output()
+	stdout, _, err := a.adbCommand(ctx, "shell", "cat", "/proc/uptime")
 	if err != nil {
-		if ee, ok := err.(*exec.ExitError); ok {
-			err = skerr.Wrapf(err, "adb failed with stderr: %q", ee.Stderr)
-		}
 		return 0, err
 	}
+
 	// The contents of /proc/uptime are the uptime in seconds, followed by the
 	// idle time of all the cores.
 	// https://en.wikipedia.org/wiki/Uptime#Using_/proc/uptime
-	uptimeAsString := string(b)
+	uptimeAsString := stdout
 	parts := strings.Split(uptimeAsString, " ")
 	if len(parts) != 2 {
 		return 0, skerr.Fmt("Found invalid format for /proc/uptime: %q", uptimeAsString)
diff --git a/machine/go/test_machine_monitor/adb/adb_test.go b/machine/go/test_machine_monitor/adb/adb_test.go
index 53f1d6d..55aecea 100644
--- a/machine/go/test_machine_monitor/adb/adb_test.go
+++ b/machine/go/test_machine_monitor/adb/adb_test.go
@@ -202,6 +202,87 @@
 	assert.Contains(t, err.Error(), "error: no devices/emulators found")
 }
 
+func TestGetState_HappyPath_Success(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext("Test_FakeExe_AdbGetState_Success")
+
+	a := New()
+	state, err := a.getState(ctx)
+	require.NoError(t, err)
+	assert.Empty(t, state)
+}
+
+func TestGetState_Offline_ErrOnOffline(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext("Test_FakeExe_AdbGetState_Offline")
+
+	a := New()
+	state, err := a.getState(ctx)
+	require.Error(t, err)
+	assert.Contains(t, state, "offline")
+}
+
+func TestGetState_Offline_ErrOnNoDeviceWithNoEmptyReturned(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext("Test_FakeExe_AdbGetState_NoDevice")
+
+	a := New()
+	state, err := a.getState(ctx)
+	require.Error(t, err)
+	assert.Contains(t, state, "no devices/emulators found")
+}
+
+func TestEnsureOnline_HappyPath_Success(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext("Test_FakeExe_AdbGetState_Success")
+
+	a := New()
+	err := a.EnsureOnline(ctx)
+	require.NoError(t, err)
+}
+
+func TestEnsureOnline_Unauthorized_ErrOnUnauthorized(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext("Test_FakeExe_AdbGetState_NoDevice")
+
+	a := New()
+	err := a.EnsureOnline(ctx)
+	require.Contains(t, err.Error(), "adb returned an error state we can't do anything about:")
+}
+
+func TestEnsureOnline_OfflineAndReconnectWorks_Success(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_AdbGetState_Success",
+		"Test_FakeExe_AdbGetState_Offline",
+		"Test_FakeExe_ReconnectOffline_Success",
+	)
+
+	a := New()
+	err := a.EnsureOnline(ctx)
+	require.NoError(t, err)
+}
+
+func TestEnsureOnline_OfflineAndReconnectFails_ReturnsError(t *testing.T) {
+	unittest.SmallTest(t)
+
+	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_AdbGetState_Offline",
+		"Test_FakeExe_ReconnectOffline_NoDevice",
+		"Test_FakeExe_AdbGetState_Offline",
+	)
+
+	a := New()
+	err := a.EnsureOnline(ctx)
+	require.Contains(t, err.Error(), "adb get-state: failed with stderr")
+}
+
 func Test_FakeExe_RawDumpSys_NonZeroExitCode(t *testing.T) {
 	unittest.FakeExeTest(t)
 	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
@@ -280,3 +361,73 @@
 
 	os.Exit(nonZeroExitCode)
 }
+
+func Test_FakeExe_AdbGetState_Success(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "get-state"}, args)
+	fmt.Fprintf(os.Stderr, "device")
+
+	// Force exit so we don't get PASS in the output.
+	os.Exit(0)
+}
+
+func Test_FakeExe_AdbGetState_Offline(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "get-state"}, args)
+	fmt.Fprintf(os.Stderr, "error: device offline")
+	os.Exit(1)
+}
+
+func Test_FakeExe_AdbGetState_NoDevice(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "get-state"}, args)
+	fmt.Fprintf(os.Stderr, "error: no devices/emulators found")
+
+	os.Exit(1)
+}
+
+func Test_FakeExe_ReconnectOffline_Success(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "reconnect", "offline"}, args)
+
+	// Force exit so we don't get PASS in the output.
+	os.Exit(0)
+}
+
+func Test_FakeExe_ReconnectOffline_NoDevice(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "reconnect", "offline"}, args)
+	// adb reconnect always exits with status code 0, even if it failed to
+	// reconnect.
+	os.Exit(0)
+}
diff --git a/machine/go/test_machine_monitor/machine/machine.go b/machine/go/test_machine_monitor/machine/machine.go
index 19c513b..1f73a28 100644
--- a/machine/go/test_machine_monitor/machine/machine.go
+++ b/machine/go/test_machine_monitor/machine/machine.go
@@ -245,6 +245,11 @@
 // (which can be partially filled out). If there is not a device attached, false is returned.
 func (m *Machine) tryInterrogatingAndroidDevice(ctx context.Context) (machine.Android, bool) {
 	var ret machine.Android
+
+	if err := m.adb.EnsureOnline(ctx); err != nil {
+		sklog.Warningf("No Android device is available: %s", err)
+		return ret, false
+	}
 	if uptime, err := m.adb.Uptime(ctx); err != nil {
 		sklog.Warningf("Failed to read uptime - assuming there is no Android device attached: %s", err)
 		return ret, false // Assume there is no Android device attached.
diff --git a/machine/go/test_machine_monitor/machine/machine_test.go b/machine/go/test_machine_monitor/machine/machine_test.go
index 31fe509..7073e7d 100644
--- a/machine/go/test_machine_monitor/machine/machine_test.go
+++ b/machine/go/test_machine_monitor/machine/machine_test.go
@@ -52,6 +52,7 @@
 func TestTryInterrogatingAndroidDevice_DeviceAttached_Success(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_AdbGetState_Success",
 		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
@@ -83,6 +84,7 @@
 func TestTryInterrogatingAndroidDevice_ThermalFails_PartialSuccess(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_AdbGetState_Success",
 		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
@@ -201,6 +203,7 @@
 func TestInterrogate_AndroidDeviceAttached_Success(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_AdbGetState_Success",
 		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
@@ -389,3 +392,18 @@
 
 	os.Exit(1)
 }
+
+func Test_FakeExe_AdbGetState_Success(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+
+	// Check the input arguments to make sure they were as expected.
+	args := executil.OriginalArgs()
+	require.Equal(t, []string{"adb", "get-state"}, args)
+	fmt.Fprintf(os.Stderr, "device")
+
+	// Force exit so we don't get PASS in the output.
+	os.Exit(0)
+}