[sk8s] Review follow-up for podwatcher.

Also includes other fixes that actually get podwatcher running, like
moving all the code from bot_config to podwatcher, oops.

Also adds "-o StrictHostKeyChecking=no" ssh flag to edgeswitch.go.

Change-Id: I9a6de0d2f23833d56316b61d3d4c689c3cf56072
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/290440
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/machine/go/machine/store/impl.go b/machine/go/machine/store/impl.go
index e747660..e2a95a1 100644
--- a/machine/go/machine/store/impl.go
+++ b/machine/go/machine/store/impl.go
@@ -228,6 +228,7 @@
 				})
 				if err != nil {
 					sklog.Errorf("Failed to update machine.Description PowerCycle: %s", err)
+					// Just log the error, still powercycle the machine.
 				}
 				ch <- machineID
 			}
diff --git a/sk8s/go/bot_config/machine/machine.go b/sk8s/go/bot_config/machine/machine.go
index c59b63b..8382f0f 100644
--- a/sk8s/go/bot_config/machine/machine.go
+++ b/sk8s/go/bot_config/machine/machine.go
@@ -18,7 +18,6 @@
 	"go.skia.org/infra/machine/go/machineserver/config"
 	"go.skia.org/infra/sk8s/go/bot_config/adb"
 	"go.skia.org/infra/sk8s/go/bot_config/swarming"
-	"go.skia.org/infra/skolo/go/powercycle"
 )
 
 const (
@@ -30,9 +29,6 @@
 	// store is the firestore backend store for machine state.
 	store *store.StoreImpl
 
-	// powercycleController allows power-cycling machines.
-	powercycleController powercycle.Controller
-
 	// sink is how we send machine.Events to the the machine state server.
 	sink sink.Sink
 
@@ -65,7 +61,7 @@
 }
 
 // New return an instance of *Machine.
-func New(ctx context.Context, local bool, instanceConfig config.InstanceConfig, powercycleConfigFilename string) (*Machine, error) {
+func New(ctx context.Context, local bool, instanceConfig config.InstanceConfig) (*Machine, error) {
 	store, err := store.New(ctx, false, instanceConfig)
 	if err != nil {
 		return nil, skerr.Wrapf(err, "Failed to build store instance.")
@@ -75,15 +71,6 @@
 		return nil, skerr.Wrapf(err, "Failed to build sink instance.")
 	}
 
-	var powercycleController powercycle.Controller
-	if powercycleConfigFilename != "" {
-		sklog.Info("Build powercycle.Controller from %q", powercycleConfigFilename)
-		connectOnStartup := !local
-		powercycleController, err = powercycle.ControllerFromJSON5(ctx, powercycleConfigFilename, connectOnStartup)
-		if err != nil {
-			return nil, skerr.Wrapf(err, "Failed to instantiate powercycle.Controller.")
-		}
-	}
 	machineID := os.Getenv(swarming.SwarmingBotIDEnvVar)
 	kubernetesImage := os.Getenv(swarming.KubernetesImageEnvVar)
 	hostname, err := os.Hostname()
@@ -94,7 +81,6 @@
 	return &Machine{
 		dimensions:                 machine.SwarmingDimensions{},
 		store:                      store,
-		powercycleController:       powercycleController,
 		sink:                       sink,
 		adb:                        adb.New(),
 		MachineID:                  machineID,
@@ -170,18 +156,6 @@
 		}
 	}()
 
-	if m.powercycleController != nil {
-		// Start a loop that does a firestore onsnapshot watcher that gets machine names
-		// that need to be power-cycled.
-		go func() {
-			for machineID := range m.store.WatchForPowerCycle(ctx) {
-				if err := m.powercycleController.PowerCycle(ctx, powercycle.DeviceID(machineID), 0); err != nil {
-					sklog.Errorf("Failed to powercycle %q: %s", machineID, err)
-				}
-			}
-		}()
-	}
-
 	return nil
 }
 
diff --git a/sk8s/go/bot_config/machine/machine_test.go b/sk8s/go/bot_config/machine/machine_test.go
index 90d63bf..41a70c2 100644
--- a/sk8s/go/bot_config/machine/machine_test.go
+++ b/sk8s/go/bot_config/machine/machine_test.go
@@ -19,7 +19,6 @@
 	"go.skia.org/infra/machine/go/machine/source/pubsubsource"
 	"go.skia.org/infra/machine/go/machineserver/config"
 	"go.skia.org/infra/sk8s/go/bot_config/swarming"
-	"go.skia.org/infra/skolo/go/powercycle"
 	"google.golang.org/api/option"
 )
 
@@ -63,26 +62,6 @@
 	return ctx, topic, instanceConfig
 }
 
-func TestNew_PowerCycleReadsInCorrectConfigFile(t *testing.T) {
-	// Manual because we are testing pubsub.
-	unittest.ManualTest(t)
-	ctx, _, instanceConfig := setupConfig(t)
-
-	// Set the POWERCYCLE_PASSWORD env variable.
-	oldVar := os.Getenv("POWERCYCLE_PASSWORD")
-	err := os.Setenv("POWERCYCLE_PASSWORD", "secret-stuff")
-	require.NoError(t, err)
-	defer func() {
-		err = os.Setenv("POWERCYCLE_PASSWORD", oldVar)
-		require.NoError(t, err)
-	}()
-
-	// Create a Machine instance.
-	m, err := New(ctx, true, instanceConfig, "./testdata/power-cycle-rack4.json5")
-	require.NoError(t, err)
-	assert.Equal(t, []powercycle.DeviceID{"skia-rpi2-rack4-shelf1-001", "skia-rpi2-rack4-shelf1-002", "skia-rpi2-rack4-shelf1-003"}, m.powercycleController.DeviceIDs())
-}
-
 func TestStart_InterrogatesDeviceInitiallyAndOnTimer(t *testing.T) {
 	// Manual because we are testing pubsub.
 	unittest.ManualTest(t)
@@ -114,7 +93,7 @@
 	}()
 
 	// Create a Machine instance.
-	m, err := New(ctx, true, instanceConfig, "")
+	m, err := New(ctx, true, instanceConfig)
 	require.NoError(t, err)
 	assert.Equal(t, "my-test-bot-001", m.MachineID)
 
@@ -230,7 +209,7 @@
 	}()
 
 	// Create a Machine instance.
-	m, err := New(ctx, true, instanceConfig, "")
+	m, err := New(ctx, true, instanceConfig)
 	require.NoError(t, err)
 
 	// Set up fakes for adb. We have two sets of 3 since Start calls
@@ -314,7 +293,7 @@
 	}()
 
 	// Create a Machine instance.
-	m, err := New(ctx, true, instanceConfig, "")
+	m, err := New(ctx, true, instanceConfig)
 	// We are running a task.
 	m.runningTask = true
 	require.NoError(t, err)
diff --git a/sk8s/go/bot_config/main.go b/sk8s/go/bot_config/main.go
index 984042c..44f5118 100644
--- a/sk8s/go/bot_config/main.go
+++ b/sk8s/go/bot_config/main.go
@@ -19,15 +19,14 @@
 
 // flags
 var (
-	configFlag               = flag.String("config", "", "The path to the configuration file.")
-	local                    = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
-	metadataURL              = flag.String("metadata_url", "http://metadata:8000/computeMetadata/v1/instance/service-accounts/default/token", "The URL of the metadata server that provides service account tokens.")
-	port                     = flag.String("port", ":11000", "HTTP service address (e.g., ':8000')")
-	powercycleConfigFilename = flag.String("powercycle_config", "", "The name of the config file for powercycle.Controller.")
-	promPort                 = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':10110')")
-	pythonExe                = flag.String("python_exe", "/usr/bin/python2.7", "Absolute path to Python.")
-	startSwarming            = flag.Bool("start_swarming", false, "If true then start swarming_bot.zip.")
-	swarmingBotZip           = flag.String("swarming_bot_zip", "/b/s/swarming_bot.zip", "Absolute path to where the swarming_bot.zip code should run from.")
+	configFlag     = flag.String("config", "", "The path to the configuration file.")
+	local          = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
+	metadataURL    = flag.String("metadata_url", "http://metadata:8000/computeMetadata/v1/instance/service-accounts/default/token", "The URL of the metadata server that provides service account tokens.")
+	port           = flag.String("port", ":11000", "HTTP service address (e.g., ':8000')")
+	promPort       = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':10110')")
+	pythonExe      = flag.String("python_exe", "/usr/bin/python2.7", "Absolute path to Python.")
+	startSwarming  = flag.Bool("start_swarming", false, "If true then start swarming_bot.zip.")
+	swarmingBotZip = flag.String("swarming_bot_zip", "/b/s/swarming_bot.zip", "Absolute path to where the swarming_bot.zip code should run from.")
 )
 
 func main() {
@@ -48,7 +47,7 @@
 	}
 
 	ctx := context.Background()
-	m, err := machine.New(ctx, *local, instanceConfig, *powercycleConfigFilename)
+	m, err := machine.New(ctx, *local, instanceConfig)
 	if err != nil {
 		sklog.Fatal("Failed to create machine: %s", err)
 	}
diff --git a/sk8s/go/podwatcher/main.go b/sk8s/go/podwatcher/main.go
index 9c3710a..0b11a9a 100644
--- a/sk8s/go/podwatcher/main.go
+++ b/sk8s/go/podwatcher/main.go
@@ -15,6 +15,7 @@
 	"encoding/json"
 	"flag"
 	"io"
+	"os"
 
 	"go.skia.org/infra/go/common"
 	"go.skia.org/infra/go/metrics2"
@@ -23,19 +24,25 @@
 	"go.skia.org/infra/machine/go/machine/store"
 	"go.skia.org/infra/machine/go/machineserver/config"
 	"go.skia.org/infra/sk8s/go/podwatcher/deleter"
+	"go.skia.org/infra/skolo/go/powercycle"
 )
 
 var (
 	// Flags.
-	configFlag = flag.String("config", "", "The path to the configuration file.")
-	local      = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
-	promPort   = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':20000')")
+	configFlag               = flag.String("config", "", "The path to the configuration file.")
+	local                    = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
+	powercycleConfigFilename = flag.String("powercycle_config", "", "The name of the config file for powercycle.Controller.")
+	promPort                 = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':20000')")
 )
 
 func main() {
 	common.InitWithMust("podwatcher", common.PrometheusOpt(promPort))
 	ctx := context.Background()
 
+	if *powercycleConfigFilename == "" {
+		sklog.Fatal("--powercycle_config flag must be supplied.")
+	}
+
 	var instanceConfig config.InstanceConfig
 	err := util.WithReadFile(*configFlag, func(r io.Reader) error {
 		return json.NewDecoder(r).Decode(&instanceConfig)
@@ -52,16 +59,43 @@
 		sklog.Fatalf("Failed to build deleter: %s", err)
 	}
 
+	sklog.Infof("Building powercycle.Controller from %q", *powercycleConfigFilename)
+	connectOnStartup := !*local
+	powercycleController, err := powercycle.ControllerFromJSON5(ctx, *powercycleConfigFilename, connectOnStartup)
+	if err != nil {
+		sklog.Fatalf("Failed to instantiate powercycle.Controller: %s", err)
+	}
+
 	successfulUpdates := metrics2.GetCounter("podreader_successful_update")
 	failedUpdates := metrics2.GetCounter("podreader_failed_update")
 
-	for podname := range store.WatchForDeletablePods(ctx) {
-		if err := deleter.Delete(ctx, podname); err != nil {
-			failedUpdates.Inc(1)
-			sklog.Errorf("Failed to update pod by deleting it: %s", err)
-			continue
+	go func() {
+		for podname := range store.WatchForDeletablePods(ctx) {
+			if err := deleter.Delete(ctx, podname); err != nil {
+				failedUpdates.Inc(1)
+				sklog.Errorf("Failed to update pod by deleting it: %s", err)
+				continue
+			}
+			sklog.Infof("Deleted: %q", podname)
+			successfulUpdates.Inc(1)
 		}
-		sklog.Infof("Deleted: %q", podname)
-		successfulUpdates.Inc(1)
-	}
+		sklog.Info("Exiting WatchForDeletablePods.")
+		os.Exit(0)
+	}()
+
+	// Start a loop that does a firestore onsnapshot watcher that gets machine names
+	// that need to be power-cycled.
+	go func() {
+		for machineID := range store.WatchForPowerCycle(ctx) {
+			if err := powercycleController.PowerCycle(ctx, powercycle.DeviceID(machineID), 0); err != nil {
+				sklog.Errorf("Failed to powercycle %q: %s", machineID, err)
+			} else {
+				sklog.Infof("Successfully powercycled: %q", machineID)
+			}
+		}
+		sklog.Info("Exiting WatchForPowerCycle")
+		os.Exit(0)
+	}()
+
+	select {}
 }
diff --git a/sk8s/images/podwatcher/Dockerfile b/sk8s/images/podwatcher/Dockerfile
index ee17e6f..7d03f22 100644
--- a/sk8s/images/podwatcher/Dockerfile
+++ b/sk8s/images/podwatcher/Dockerfile
@@ -2,7 +2,7 @@
 
 USER root
 
-RUN apk update && apk add --no-cache bash
+RUN apk update && apk add --no-cache bash sshpass openssh-client
 
 COPY . /
 
diff --git a/sk8s/images/podwatcher/release b/sk8s/images/podwatcher/release
index 3950a32..29e00d2 100755
--- a/sk8s/images/podwatcher/release
+++ b/sk8s/images/podwatcher/release
@@ -12,11 +12,11 @@
 INSTALL="install -D --verbose --backup=none"
 
 # Add the dockerfile and binary.
-${INSTALL} --mode=644 -T ${IMAGE}/Dockerfile                ${ROOT}/Dockerfile
-${INSTALL} --mode=755 -T ${GOPATH}/bin/podwatcher           ${ROOT}/usr/local/bin/${APPNAME}
-${INSTALL_DIR} --mode=755                                   ${ROOT}/usr/local/share/${APPNAME}/
-${INSTALL} --mode=644 ../machine/configs/*                  ${ROOT}/usr/local/share/${APPNAME}/
-${INSTALL} --mode=755 -T ${IMAGE}/power-cycle-rack4.json5   ${ROOT}/usr/local/share/${APPNAME}/power-cycle-rack4.json5
+${INSTALL} --mode=644 -T ${IMAGE}/Dockerfile                                 ${ROOT}/Dockerfile
+${INSTALL} --mode=755 -T ${GOPATH}/bin/podwatcher                            ${ROOT}/usr/local/bin/${APPNAME}
+${INSTALL_DIR} --mode=755                                                    ${ROOT}/usr/local/share/${APPNAME}/
+${INSTALL} --mode=644 ../machine/configs/*                                   ${ROOT}/usr/local/share/${APPNAME}/
+${INSTALL} --mode=755 -T ${IMAGE}/../../../skolo/sys/power-cycle-rack4.json5 ${ROOT}/usr/local/share/${APPNAME}/power-cycle-rack4.json5
 }
 
 source ../bash/docker_build.sh
diff --git a/skolo/go/powercycle/edgeswitch.go b/skolo/go/powercycle/edgeswitch.go
index b4786c8..e8367e9 100644
--- a/skolo/go/powercycle/edgeswitch.go
+++ b/skolo/go/powercycle/edgeswitch.go
@@ -54,7 +54,7 @@
 	if c.Password != "" {
 		return c.Password
 	}
-	return os.Getenv(powerCyclePasswordEnvVar)
+	return strings.TrimSpace(os.Getenv(powerCyclePasswordEnvVar))
 }
 
 // edgeSwitchClient implements the Client interface.
@@ -73,7 +73,7 @@
 	}
 	target := fmt.Sprintf("%s@%s", conf.User, conf.Address)
 	// The -T removes a warning SSH gives because we are not invoking it over TTY.
-	runner := PasswordSSHCommandRunner(conf.getPassword(), "-T", target)
+	runner := PasswordSSHCommandRunner(conf.getPassword(), "-T", target, "-o", "StrictHostKeyChecking=no")
 	if connect {
 		out, _ := runner.ExecCmds(ctx, "help")
 		// When using sshpass, we always seem to get exit code 255 (from ssh) and any actual errors are
diff --git a/sk8s/images/podwatcher/power-cycle-rack4.json5 b/skolo/sys/power-cycle-rack4.json5
similarity index 100%
rename from sk8s/images/podwatcher/power-cycle-rack4.json5
rename to skolo/sys/power-cycle-rack4.json5