[tmm] Add support for talking to ChromeOS over SSH.

This ports the logic from https://chrome-internal.googlesource.com/infradata/config/+/1f7926b4420596666f4404092cd7c05b7b3a727c/configs/chromium-swarm/scripts/skia_mobile.py#619

Change-Id: I207924a83c86be53ed4c747688678b39c5b452ad
BUG: skia:12401
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/447765
Commit-Queue: Kevin Lubick <kjlubick@google.com>
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
diff --git a/machine/go/test_machine_monitor/machine/BUILD.bazel b/machine/go/test_machine_monitor/machine/BUILD.bazel
index c54975b..58e4acd 100644
--- a/machine/go/test_machine_monitor/machine/BUILD.bazel
+++ b/machine/go/test_machine_monitor/machine/BUILD.bazel
@@ -17,6 +17,7 @@
         "//machine/go/machine/store",
         "//machine/go/machineserver/config",
         "//machine/go/test_machine_monitor/adb",
+        "//machine/go/test_machine_monitor/ssh",
         "//machine/go/test_machine_monitor/swarming",
     ],
 )
@@ -38,6 +39,7 @@
         "//machine/go/machine/source/pubsubsource",
         "//machine/go/machineserver/config",
         "//machine/go/test_machine_monitor/adb",
+        "//machine/go/test_machine_monitor/ssh",
         "//machine/go/test_machine_monitor/swarming",
         "@com_github_stretchr_testify//assert",
         "@com_github_stretchr_testify//require",
diff --git a/machine/go/test_machine_monitor/machine/machine.go b/machine/go/test_machine_monitor/machine/machine.go
index c90bfd6..19c513b 100644
--- a/machine/go/test_machine_monitor/machine/machine.go
+++ b/machine/go/test_machine_monitor/machine/machine.go
@@ -4,6 +4,9 @@
 import (
 	"context"
 	"os"
+	"regexp"
+	"strconv"
+	"strings"
 	"sync"
 	"time"
 
@@ -17,6 +20,7 @@
 	"go.skia.org/infra/machine/go/machine/store"
 	"go.skia.org/infra/machine/go/machineserver/config"
 	"go.skia.org/infra/machine/go/test_machine_monitor/adb"
+	"go.skia.org/infra/machine/go/test_machine_monitor/ssh"
 	"go.skia.org/infra/machine/go/test_machine_monitor/swarming"
 )
 
@@ -35,6 +39,9 @@
 	// adb makes calls to the adb server.
 	adb adb.Adb
 
+	// ssh is an abstraction around an ssh executor
+	ssh ssh.SSH
+
 	// MachineID is the swarming id of the machine.
 	MachineID string
 
@@ -239,7 +246,7 @@
 func (m *Machine) tryInterrogatingAndroidDevice(ctx context.Context) (machine.Android, bool) {
 	var ret machine.Android
 	if uptime, err := m.adb.Uptime(ctx); err != nil {
-		sklog.Warningf("Failed to read uptime - assuming there is no device attached: %s", err)
+		sklog.Warningf("Failed to read uptime - assuming there is no Android device attached: %s", err)
 		return ret, false // Assume there is no Android device attached.
 	} else {
 		ret.Uptime = uptime
@@ -270,7 +277,60 @@
 	return machine.IOS{}, false
 }
 
-func (m *Machine) tryInterrogatingChromeOSDevice(_ context.Context) (machine.ChromeOS, bool) {
-	// TODO(kjlubick)
-	return machine.ChromeOS{}, false
+// These patterns extract individual KEY=value entries from the contents of /etc/lsb-release.
+var (
+	chromeOSReleaseRegex   = regexp.MustCompile(`CHROMEOS_RELEASE_VERSION=(\S+)`)
+	chromeOSMilestoneRegex = regexp.MustCompile(`CHROMEOS_RELEASE_CHROME_MILESTONE=(\S+)`)
+	chromeOSTrackRegex     = regexp.MustCompile(`CHROMEOS_RELEASE_TRACK=(\S+)`)
+)
+
+// tryInterrogatingChromeOSDevice queries the device at m.description.SSHUserIP over SSH for its
+// ChromeOS release information and uptime.
+//
+// It returns false when no SSHUserIP is configured, when /etc/lsb-release cannot be read, or when
+// that file has none of the expected CHROMEOS_RELEASE_* entries. Uptime is best effort: if
+// /proc/uptime cannot be read or parsed, the release data is still returned with a zero Uptime.
+func (m *Machine) tryInterrogatingChromeOSDevice(ctx context.Context) (machine.ChromeOS, bool) {
+	if m.description.SSHUserIP == "" {
+		return machine.ChromeOS{}, false
+	}
+	lsbReleaseContents, err := m.ssh.Run(ctx, m.description.SSHUserIP, "cat", "/etc/lsb-release")
+	if err != nil {
+		sklog.Warningf("Failed to read lsb-release - assuming there is no ChromeOS device attached: %s", err)
+		return machine.ChromeOS{}, false
+	}
+	rv := machine.ChromeOS{}
+	if match := chromeOSReleaseRegex.FindStringSubmatch(lsbReleaseContents); match != nil {
+		rv.ReleaseVersion = match[1]
+	}
+	if match := chromeOSMilestoneRegex.FindStringSubmatch(lsbReleaseContents); match != nil {
+		rv.Milestone = match[1]
+	}
+	if match := chromeOSTrackRegex.FindStringSubmatch(lsbReleaseContents); match != nil {
+		rv.Channel = match[1]
+	}
+	if rv.ReleaseVersion == "" && rv.Milestone == "" && rv.Channel == "" {
+		sklog.Errorf("Could not find ChromeOS data in /etc/lsb-release. Are we sure this is the right IP?\n%s", lsbReleaseContents)
+		return machine.ChromeOS{}, false
+	}
+
+	uptime, err := m.ssh.Run(ctx, m.description.SSHUserIP, "cat", "/proc/uptime")
+	if err != nil {
+		sklog.Warningf("Could not read ChromeOS uptime %s", err)
+		return rv, true
+	}
+	// Only the first whitespace-separated field is used; it is parsed as seconds. strings.Fields
+	// (rather than splitting on a single space) tolerates surrounding whitespace and line endings.
+	fields := strings.Fields(uptime)
+	if len(fields) == 0 {
+		sklog.Warningf("Invalid /proc/uptime format: %q", uptime)
+		return rv, true
+	}
+	seconds, err := strconv.ParseFloat(fields[0], 64)
+	if err != nil {
+		sklog.Warningf("Invalid /proc/uptime format: %q", uptime)
+		return rv, true
+	}
+	rv.Uptime = time.Duration(seconds * float64(time.Second))
+	return rv, true
 }
diff --git a/machine/go/test_machine_monitor/machine/machine_test.go b/machine/go/test_machine_monitor/machine/machine_test.go
index ee4aaf1..31fe509 100644
--- a/machine/go/test_machine_monitor/machine/machine_test.go
+++ b/machine/go/test_machine_monitor/machine/machine_test.go
@@ -14,6 +14,7 @@
 	"go.skia.org/infra/go/testutils/unittest"
 	"go.skia.org/infra/machine/go/machine"
 	"go.skia.org/infra/machine/go/test_machine_monitor/adb"
+	"go.skia.org/infra/machine/go/test_machine_monitor/ssh"
 )
 
 const (
@@ -23,12 +24,35 @@
 	dumpSysThermalPlaceholder = "Placeholder dumpsys thermal response"
 	// This formatting matters because it is processed in adb.go
 	adbUptimePlaceholder = "123.4 567.8"
+
+	testUserIP = "root@skia-foobar-01"
+	// This example was taken directly from a production ChromeOS device
+	sampleChromeOSlsbrelease = `CHROMEOS_DEVSERVER=
+CHROMEOS_RELEASE_APPID={9A3BE5D2-C3DC-4AE6-9943-E2C113895DC5}
+CHROMEOS_RELEASE_BOARD=octopus-signed-mp-v23keys
+CHROMEOS_RELEASE_BRANCH_NUMBER=56
+CHROMEOS_RELEASE_BUILDER_PATH=octopus-release/R89-13729.56.0
+CHROMEOS_RELEASE_BUILD_NUMBER=13729
+CHROMEOS_RELEASE_BUILD_TYPE=Official Build
+CHROMEOS_RELEASE_CHROME_MILESTONE=89
+CHROMEOS_RELEASE_DESCRIPTION=13729.56.0 (Official Build) stable-channel octopus
+CHROMEOS_RELEASE_KEYSET=mp-v23
+CHROMEOS_RELEASE_NAME=Chrome OS
+CHROMEOS_RELEASE_PATCH_NUMBER=0
+CHROMEOS_RELEASE_TRACK=stable-channel
+CHROMEOS_RELEASE_UNIBUILD=1
+CHROMEOS_RELEASE_VERSION=13729.56.0
+DEVICETYPE=CHROMEBOOK
+GOOGLE_RELEASE=13729.56.0`
+
+	// Sample /proc/uptime contents. The format matters: the first field is parsed as uptime in seconds.
+	sampleChromeOSUptime = "1234.5 5678.9"
 )
 
 func TestTryInterrogatingAndroidDevice_DeviceAttached_Success(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
-		"Test_FakeExe_Uptime_ReturnsPlaceholder",
+		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysThermal_ReturnsPlaceholder",
@@ -59,7 +83,7 @@
 func TestTryInterrogatingAndroidDevice_ThermalFails_PartialSuccess(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
-		"Test_FakeExe_Uptime_ReturnsPlaceholder",
+		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
 		"Test_FakeExe_ExitCodeOne",
@@ -75,10 +99,77 @@
 	}, actual)
 }
 
+// A reachable ChromeOS device reports its release information together with its uptime.
+func TestTryInterrogatingChromeOS_DeviceReachable_Success(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_SSHLSBRelease_ReturnsPlaceholder", "Test_FakeExe_SSHUptime_ReturnsPlaceholder",
+	)
+	expected := machine.ChromeOS{
+		Channel:        "stable-channel",
+		Milestone:      "89",
+		ReleaseVersion: "13729.56.0",
+		Uptime:         1234500 * time.Millisecond,
+	}
+	dut := &Machine{ssh: ssh.ExeImpl{}, description: machine.Description{SSHUserIP: testUserIP}}
+	actual, ok := dut.tryInterrogatingChromeOSDevice(ctx)
+	assert.True(t, ok)
+	assert.Equal(t, expected, actual)
+}
+
+// When the first ssh command (cat /etc/lsb-release) exits non-zero, no ChromeOS device is
+// reported.
+func TestTryInterrogatingChromeOS_CatLSBReleaseFails_DeviceConsideredUnattached(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext("Test_FakeExe_ExitCodeOne")
+
+	dut := &Machine{ssh: ssh.ExeImpl{}, description: machine.Description{SSHUserIP: testUserIP}}
+	_, attached := dut.tryInterrogatingChromeOSDevice(ctx)
+	assert.False(t, attached)
+}
+
+// Without an SSHUserIP in the description, no command is run and no device is reported.
+func TestTryInterrogatingChromeOS_NoSSHUserIP_ReturnFalse(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext() // Any exe call will panic
+	dut := &Machine{ssh: ssh.ExeImpl{}}
+	_, attached := dut.tryInterrogatingChromeOSDevice(ctx)
+	assert.False(t, attached)
+}
+
+// If only the uptime query fails, the release data is still reported, with no uptime set.
+func TestTryInterrogatingChromeOS_PartialData_PartialSuccess(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_SSHLSBRelease_ReturnsPlaceholder",
+		"Test_FakeExe_ExitCodeOne", // pretend uptime fails
+	)
+	expected := machine.ChromeOS{
+		Channel:        "stable-channel",
+		Milestone:      "89",
+		ReleaseVersion: "13729.56.0",
+	}
+	dut := &Machine{ssh: ssh.ExeImpl{}, description: machine.Description{SSHUserIP: testUserIP}}
+	actual, ok := dut.tryInterrogatingChromeOSDevice(ctx)
+	assert.True(t, ok)
+	assert.Equal(t, expected, actual)
+}
+
+// An lsb-release file without any CHROMEOS_RELEASE_* entries means the host is not treated as
+// a ChromeOS device.
+func TestTryInterrogatingChromeOS_NoChromeOSData_AssumesNotAttached(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext("Test_FakeExe_SSHLSBRelease_ReturnsNonChromeOS")
+
+	dut := &Machine{ssh: ssh.ExeImpl{}, description: machine.Description{SSHUserIP: testUserIP}}
+	_, attached := dut.tryInterrogatingChromeOSDevice(ctx)
+	assert.False(t, attached)
+}
+
 func TestInterrogate_NoDeviceAttached_Success(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
-		"Test_FakeExe_ExitCodeOne", // No android device
+		"Test_FakeExe_ExitCodeOne", // No Android device
 	)
 
 	m := &Machine{
@@ -110,7 +201,7 @@
 func TestInterrogate_AndroidDeviceAttached_Success(t *testing.T) {
 	unittest.SmallTest(t)
 	ctx := executil.FakeTestsContext(
-		"Test_FakeExe_Uptime_ReturnsPlaceholder",
+		"Test_FakeExe_ADBUptime_ReturnsPlaceholder",
 		"Test_FakeExe_AdbShellGetProp_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysBattery_ReturnsPlaceholder",
 		"Test_FakeExe_RawDumpSysThermal_ReturnsPlaceholder",
@@ -148,7 +239,50 @@
 	}, actual)
 }
 
-func Test_FakeExe_Uptime_ReturnsPlaceholder(t *testing.T) {
+// interrogate should fill in the ChromeOS portion of the event when both SSH queries succeed,
+// alongside the host information taken from the Machine itself.
+func TestInterrogate_ChromeOSDeviceAttached_Success(t *testing.T) {
+	unittest.SmallTest(t)
+	ctx := executil.FakeTestsContext(
+		"Test_FakeExe_SSHLSBRelease_ReturnsPlaceholder",
+		"Test_FakeExe_SSHUptime_ReturnsPlaceholder",
+		// We found a device, no need to check for adb
+	)
+	started := time.Date(2021, time.September, 2, 2, 2, 2, 2, time.UTC)
+	m := &Machine{
+		ssh:              ssh.ExeImpl{},
+		MachineID:        "some-machine",
+		Hostname:         "some-hostname",
+		description:      machine.Description{SSHUserIP: testUserIP},
+		KubernetesImage:  "deprecated",
+		Version:          "some-version",
+		runningTask:      true,
+		startSwarming:    true,
+		startTime:        started,
+		interrogateTimer: noop.Float64SummaryMetric{},
+	}
+	expected := machine.Event{
+		EventType:           machine.EventTypeRawState,
+		LaunchedSwarming:    true,
+		RunningSwarmingTask: true,
+		Host: machine.Host{
+			Name:            "some-machine",
+			PodName:         "some-hostname",
+			KubernetesImage: "deprecated",
+			Version:         "some-version",
+			StartTime:       started,
+		},
+		ChromeOS: machine.ChromeOS{
+			Channel:        "stable-channel",
+			Milestone:      "89",
+			ReleaseVersion: "13729.56.0",
+			Uptime:         1234500 * time.Millisecond,
+		},
+	}
+	assert.Equal(t, expected, m.interrogate(ctx))
+}
+
+func Test_FakeExe_ADBUptime_ReturnsPlaceholder(t *testing.T) {
 	unittest.FakeExeTest(t)
 	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
 		return
@@ -202,6 +336,51 @@
 	os.Exit(0)
 }
 
+// Test_FakeExe_SSHLSBRelease_ReturnsPlaceholder fakes ssh printing a ChromeOS /etc/lsb-release.
+func Test_FakeExe_SSHLSBRelease_ReturnsPlaceholder(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+	// Confirm this is standing in for the ssh invocation we expect.
+	originalArgs := executil.OriginalArgs()
+	for _, want := range []string{"ssh", testUserIP, "/etc/lsb-release"} {
+		require.Contains(t, originalArgs, want)
+	}
+	fmt.Print(sampleChromeOSlsbrelease)
+	os.Exit(0)
+}
+
+// Test_FakeExe_SSHLSBRelease_ReturnsNonChromeOS fakes ssh printing a non-ChromeOS lsb-release.
+func Test_FakeExe_SSHLSBRelease_ReturnsNonChromeOS(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+	// Confirm this is standing in for the ssh invocation we expect.
+	originalArgs := executil.OriginalArgs()
+	for _, want := range []string{"ssh", testUserIP, "/etc/lsb-release"} {
+		require.Contains(t, originalArgs, want)
+	}
+	fmt.Print("FOO=bar")
+	os.Exit(0)
+}
+
+// Test_FakeExe_SSHUptime_ReturnsPlaceholder fakes ssh printing the contents of /proc/uptime.
+func Test_FakeExe_SSHUptime_ReturnsPlaceholder(t *testing.T) {
+	unittest.FakeExeTest(t)
+	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
+		return
+	}
+	// Confirm this is standing in for the ssh invocation we expect.
+	originalArgs := executil.OriginalArgs()
+	for _, want := range []string{"ssh", testUserIP, "/proc/uptime"} {
+		require.Contains(t, originalArgs, want)
+	}
+	fmt.Print(sampleChromeOSUptime)
+	os.Exit(0)
+}
+
 func Test_FakeExe_ExitCodeOne(t *testing.T) {
 	unittest.FakeExeTest(t)
 	if os.Getenv(executil.OverrideEnvironmentVariable) == "" {
diff --git a/machine/go/test_machine_monitor/ssh/BUILD.bazel b/machine/go/test_machine_monitor/ssh/BUILD.bazel
new file mode 100644
index 0000000..a3b4bfd
--- /dev/null
+++ b/machine/go/test_machine_monitor/ssh/BUILD.bazel
@@ -0,0 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "ssh",
+    srcs = ["ssh.go"],
+    importpath = "go.skia.org/infra/machine/go/test_machine_monitor/ssh",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//go/executil",
+        "//go/skerr",
+    ],
+)
diff --git a/machine/go/test_machine_monitor/ssh/ssh.go b/machine/go/test_machine_monitor/ssh/ssh.go
new file mode 100644
index 0000000..c0dc97f
--- /dev/null
+++ b/machine/go/test_machine_monitor/ssh/ssh.go
@@ -0,0 +1,52 @@
+// Package ssh provides a small abstraction for running commands on a remote host over SSH.
+package ssh
+
+import (
+	"context"
+	"errors"
+	"os/exec"
+	"time"
+
+	"go.skia.org/infra/go/executil"
+	"go.skia.org/infra/go/skerr"
+)
+
+const (
+	commandTimeout = 20 * time.Second // chosen arbitrarily
+)
+
+// SSH runs commands on a remote host.
+type SSH interface {
+	// Run executes cmd with the given args on the host identified by userIP (for example
+	// "root@some-host") and returns what the command wrote to stdout.
+	Run(ctx context.Context, userIP, cmd string, args ...string) (string, error)
+}
+
+// ExeImpl runs SSH via an executable that is assumed to be on the PATH.
+type ExeImpl struct{}
+
+// Compile-time check that ExeImpl satisfies SSH.
+var _ SSH = ExeImpl{}
+
+// Run implements SSH by invoking the ssh executable with -oConnectTimeout=15 and -oBatchMode=yes;
+// the whole invocation is additionally bounded by commandTimeout. If ssh exits with an error,
+// the returned error includes whatever was captured from its stderr.
+func (e ExeImpl) Run(ctx context.Context, userIP, cmd string, args ...string) (string, error) {
+	ctx, cancel := context.WithTimeout(ctx, commandTimeout)
+	defer cancel()
+	sshArgs := []string{
+		"-oConnectTimeout=15", "-oBatchMode=yes",
+		"-t", "-t", // These might not work on Windows
+		userIP, cmd,
+	}
+	sshArgs = append(sshArgs, args...)
+	out, err := executil.CommandContext(ctx, "ssh", sshArgs...).Output()
+	if err != nil {
+		var exitErr *exec.ExitError
+		if errors.As(err, &exitErr) {
+			return "", skerr.Wrapf(err, "ssh failed with stderr: %q", exitErr.Stderr)
+		}
+		return "", skerr.Wrapf(err, "could not run ssh")
+	}
+	return string(out), nil
+}