[sk8s] Review follow-up for podwatcher.
Also includes other fixes that actually get podwatcher running, like
moving all the code from bot_config to podwatcher, oops.
Also adds "-o StrictHostKeyChecking=no" ssh flag to edgeswitch.go.
Change-Id: I9a6de0d2f23833d56316b61d3d4c689c3cf56072
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/290440
Reviewed-by: Joe Gregorio <jcgregorio@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/machine/go/machine/store/impl.go b/machine/go/machine/store/impl.go
index e747660..e2a95a1 100644
--- a/machine/go/machine/store/impl.go
+++ b/machine/go/machine/store/impl.go
@@ -228,6 +228,7 @@
})
if err != nil {
sklog.Errorf("Failed to update machine.Description PowerCycle: %s", err)
+ // Just log the error, still powercycle the machine.
}
ch <- machineID
}
diff --git a/sk8s/go/bot_config/machine/machine.go b/sk8s/go/bot_config/machine/machine.go
index c59b63b..8382f0f 100644
--- a/sk8s/go/bot_config/machine/machine.go
+++ b/sk8s/go/bot_config/machine/machine.go
@@ -18,7 +18,6 @@
"go.skia.org/infra/machine/go/machineserver/config"
"go.skia.org/infra/sk8s/go/bot_config/adb"
"go.skia.org/infra/sk8s/go/bot_config/swarming"
- "go.skia.org/infra/skolo/go/powercycle"
)
const (
@@ -30,9 +29,6 @@
// store is the firestore backend store for machine state.
store *store.StoreImpl
- // powercycleController allows power-cycling machines.
- powercycleController powercycle.Controller
-
// sink is how we send machine.Events to the the machine state server.
sink sink.Sink
@@ -65,7 +61,7 @@
}
// New return an instance of *Machine.
-func New(ctx context.Context, local bool, instanceConfig config.InstanceConfig, powercycleConfigFilename string) (*Machine, error) {
+func New(ctx context.Context, local bool, instanceConfig config.InstanceConfig) (*Machine, error) {
store, err := store.New(ctx, false, instanceConfig)
if err != nil {
return nil, skerr.Wrapf(err, "Failed to build store instance.")
@@ -75,15 +71,6 @@
return nil, skerr.Wrapf(err, "Failed to build sink instance.")
}
- var powercycleController powercycle.Controller
- if powercycleConfigFilename != "" {
- sklog.Info("Build powercycle.Controller from %q", powercycleConfigFilename)
- connectOnStartup := !local
- powercycleController, err = powercycle.ControllerFromJSON5(ctx, powercycleConfigFilename, connectOnStartup)
- if err != nil {
- return nil, skerr.Wrapf(err, "Failed to instantiate powercycle.Controller.")
- }
- }
machineID := os.Getenv(swarming.SwarmingBotIDEnvVar)
kubernetesImage := os.Getenv(swarming.KubernetesImageEnvVar)
hostname, err := os.Hostname()
@@ -94,7 +81,6 @@
return &Machine{
dimensions: machine.SwarmingDimensions{},
store: store,
- powercycleController: powercycleController,
sink: sink,
adb: adb.New(),
MachineID: machineID,
@@ -170,18 +156,6 @@
}
}()
- if m.powercycleController != nil {
- // Start a loop that does a firestore onsnapshot watcher that gets machine names
- // that need to be power-cycled.
- go func() {
- for machineID := range m.store.WatchForPowerCycle(ctx) {
- if err := m.powercycleController.PowerCycle(ctx, powercycle.DeviceID(machineID), 0); err != nil {
- sklog.Errorf("Failed to powercycle %q: %s", machineID, err)
- }
- }
- }()
- }
-
return nil
}
diff --git a/sk8s/go/bot_config/machine/machine_test.go b/sk8s/go/bot_config/machine/machine_test.go
index 90d63bf..41a70c2 100644
--- a/sk8s/go/bot_config/machine/machine_test.go
+++ b/sk8s/go/bot_config/machine/machine_test.go
@@ -19,7 +19,6 @@
"go.skia.org/infra/machine/go/machine/source/pubsubsource"
"go.skia.org/infra/machine/go/machineserver/config"
"go.skia.org/infra/sk8s/go/bot_config/swarming"
- "go.skia.org/infra/skolo/go/powercycle"
"google.golang.org/api/option"
)
@@ -63,26 +62,6 @@
return ctx, topic, instanceConfig
}
-func TestNew_PowerCycleReadsInCorrectConfigFile(t *testing.T) {
- // Manual because we are testing pubsub.
- unittest.ManualTest(t)
- ctx, _, instanceConfig := setupConfig(t)
-
- // Set the POWERCYCLE_PASSWORD env variable.
- oldVar := os.Getenv("POWERCYCLE_PASSWORD")
- err := os.Setenv("POWERCYCLE_PASSWORD", "secret-stuff")
- require.NoError(t, err)
- defer func() {
- err = os.Setenv("POWERCYCLE_PASSWORD", oldVar)
- require.NoError(t, err)
- }()
-
- // Create a Machine instance.
- m, err := New(ctx, true, instanceConfig, "./testdata/power-cycle-rack4.json5")
- require.NoError(t, err)
- assert.Equal(t, []powercycle.DeviceID{"skia-rpi2-rack4-shelf1-001", "skia-rpi2-rack4-shelf1-002", "skia-rpi2-rack4-shelf1-003"}, m.powercycleController.DeviceIDs())
-}
-
func TestStart_InterrogatesDeviceInitiallyAndOnTimer(t *testing.T) {
// Manual because we are testing pubsub.
unittest.ManualTest(t)
@@ -114,7 +93,7 @@
}()
// Create a Machine instance.
- m, err := New(ctx, true, instanceConfig, "")
+ m, err := New(ctx, true, instanceConfig)
require.NoError(t, err)
assert.Equal(t, "my-test-bot-001", m.MachineID)
@@ -230,7 +209,7 @@
}()
// Create a Machine instance.
- m, err := New(ctx, true, instanceConfig, "")
+ m, err := New(ctx, true, instanceConfig)
require.NoError(t, err)
// Set up fakes for adb. We have two sets of 3 since Start calls
@@ -314,7 +293,7 @@
}()
// Create a Machine instance.
- m, err := New(ctx, true, instanceConfig, "")
+ m, err := New(ctx, true, instanceConfig)
// We are running a task.
m.runningTask = true
require.NoError(t, err)
diff --git a/sk8s/go/bot_config/main.go b/sk8s/go/bot_config/main.go
index 984042c..44f5118 100644
--- a/sk8s/go/bot_config/main.go
+++ b/sk8s/go/bot_config/main.go
@@ -19,15 +19,14 @@
// flags
var (
- configFlag = flag.String("config", "", "The path to the configuration file.")
- local = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
- metadataURL = flag.String("metadata_url", "http://metadata:8000/computeMetadata/v1/instance/service-accounts/default/token", "The URL of the metadata server that provides service account tokens.")
- port = flag.String("port", ":11000", "HTTP service address (e.g., ':8000')")
- powercycleConfigFilename = flag.String("powercycle_config", "", "The name of the config file for powercycle.Controller.")
- promPort = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':10110')")
- pythonExe = flag.String("python_exe", "/usr/bin/python2.7", "Absolute path to Python.")
- startSwarming = flag.Bool("start_swarming", false, "If true then start swarming_bot.zip.")
- swarmingBotZip = flag.String("swarming_bot_zip", "/b/s/swarming_bot.zip", "Absolute path to where the swarming_bot.zip code should run from.")
+ configFlag = flag.String("config", "", "The path to the configuration file.")
+ local = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
+ metadataURL = flag.String("metadata_url", "http://metadata:8000/computeMetadata/v1/instance/service-accounts/default/token", "The URL of the metadata server that provides service account tokens.")
+ port = flag.String("port", ":11000", "HTTP service address (e.g., ':8000')")
+ promPort = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':10110')")
+ pythonExe = flag.String("python_exe", "/usr/bin/python2.7", "Absolute path to Python.")
+ startSwarming = flag.Bool("start_swarming", false, "If true then start swarming_bot.zip.")
+ swarmingBotZip = flag.String("swarming_bot_zip", "/b/s/swarming_bot.zip", "Absolute path to where the swarming_bot.zip code should run from.")
)
func main() {
@@ -48,7 +47,7 @@
}
ctx := context.Background()
- m, err := machine.New(ctx, *local, instanceConfig, *powercycleConfigFilename)
+ m, err := machine.New(ctx, *local, instanceConfig)
if err != nil {
sklog.Fatal("Failed to create machine: %s", err)
}
diff --git a/sk8s/go/podwatcher/main.go b/sk8s/go/podwatcher/main.go
index 9c3710a..0b11a9a 100644
--- a/sk8s/go/podwatcher/main.go
+++ b/sk8s/go/podwatcher/main.go
@@ -15,6 +15,7 @@
"encoding/json"
"flag"
"io"
+ "os"
"go.skia.org/infra/go/common"
"go.skia.org/infra/go/metrics2"
@@ -23,19 +24,25 @@
"go.skia.org/infra/machine/go/machine/store"
"go.skia.org/infra/machine/go/machineserver/config"
"go.skia.org/infra/sk8s/go/podwatcher/deleter"
+ "go.skia.org/infra/skolo/go/powercycle"
)
var (
// Flags.
- configFlag = flag.String("config", "", "The path to the configuration file.")
- local = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
- promPort = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':20000')")
+ configFlag = flag.String("config", "", "The path to the configuration file.")
+ local = flag.Bool("local", false, "Running locally if true. As opposed to in production.")
+ powercycleConfigFilename = flag.String("powercycle_config", "", "The name of the config file for powercycle.Controller.")
+ promPort = flag.String("prom_port", ":20000", "Metrics service address (e.g., ':20000')")
)
func main() {
common.InitWithMust("podwatcher", common.PrometheusOpt(promPort))
ctx := context.Background()
+ if *powercycleConfigFilename == "" {
+ sklog.Fatal("--powercycle_config flag must be supplied.")
+ }
+
var instanceConfig config.InstanceConfig
err := util.WithReadFile(*configFlag, func(r io.Reader) error {
return json.NewDecoder(r).Decode(&instanceConfig)
@@ -52,16 +59,43 @@
sklog.Fatalf("Failed to build deleter: %s", err)
}
+ sklog.Info("Building powercycle.Controller from %q", powercycleConfigFilename)
+ connectOnStartup := !*local
+ powercycleController, err := powercycle.ControllerFromJSON5(ctx, *powercycleConfigFilename, connectOnStartup)
+ if err != nil {
+ sklog.Fatalf("Failed to instantiate powercycle.Controller: %s", err)
+ }
+
successfulUpdates := metrics2.GetCounter("podreader_successful_update")
failedUpdates := metrics2.GetCounter("podreader_failed_update")
- for podname := range store.WatchForDeletablePods(ctx) {
- if err := deleter.Delete(ctx, podname); err != nil {
- failedUpdates.Inc(1)
- sklog.Errorf("Failed to update pod by deleting it: %s", err)
- continue
+ go func() {
+ for podname := range store.WatchForDeletablePods(ctx) {
+ if err := deleter.Delete(ctx, podname); err != nil {
+ failedUpdates.Inc(1)
+ sklog.Errorf("Failed to update pod by deleting it: %s", err)
+ continue
+ }
+ sklog.Infof("Deleted: %q", podname)
+ successfulUpdates.Inc(1)
}
- sklog.Infof("Deleted: %q", podname)
- successfulUpdates.Inc(1)
- }
+ sklog.Info("Exiting WatchForDeletablePods.")
+ os.Exit(0)
+ }()
+
+ // Start a loop that does a firestore onsnapshot watcher that gets machine names
+ // that need to be power-cycled.
+ go func() {
+ for machineID := range store.WatchForPowerCycle(ctx) {
+ if err := powercycleController.PowerCycle(ctx, powercycle.DeviceID(machineID), 0); err != nil {
+ sklog.Errorf("Failed to powercycle %q: %s", machineID, err)
+ } else {
+ sklog.Infof("Successfully powercycled: %q", machineID)
+ }
+ }
+ sklog.Info("Exiting WatchForPowerCycle")
+ os.Exit(0)
+ }()
+
+ select {}
}
diff --git a/sk8s/images/podwatcher/Dockerfile b/sk8s/images/podwatcher/Dockerfile
index ee17e6f..7d03f22 100644
--- a/sk8s/images/podwatcher/Dockerfile
+++ b/sk8s/images/podwatcher/Dockerfile
@@ -2,7 +2,7 @@
USER root
-RUN apk update && apk add --no-cache bash
+RUN apk update && apk add --no-cache bash sshpass openssh-client
COPY . /
diff --git a/sk8s/images/podwatcher/release b/sk8s/images/podwatcher/release
index 3950a32..29e00d2 100755
--- a/sk8s/images/podwatcher/release
+++ b/sk8s/images/podwatcher/release
@@ -12,11 +12,11 @@
INSTALL="install -D --verbose --backup=none"
# Add the dockerfile and binary.
-${INSTALL} --mode=644 -T ${IMAGE}/Dockerfile ${ROOT}/Dockerfile
-${INSTALL} --mode=755 -T ${GOPATH}/bin/podwatcher ${ROOT}/usr/local/bin/${APPNAME}
-${INSTALL_DIR} --mode=755 ${ROOT}/usr/local/share/${APPNAME}/
-${INSTALL} --mode=644 ../machine/configs/* ${ROOT}/usr/local/share/${APPNAME}/
-${INSTALL} --mode=755 -T ${IMAGE}/power-cycle-rack4.json5 ${ROOT}/usr/local/share/${APPNAME}/power-cycle-rack4.json5
+${INSTALL} --mode=644 -T ${IMAGE}/Dockerfile ${ROOT}/Dockerfile
+${INSTALL} --mode=755 -T ${GOPATH}/bin/podwatcher ${ROOT}/usr/local/bin/${APPNAME}
+${INSTALL_DIR} --mode=755 ${ROOT}/usr/local/share/${APPNAME}/
+${INSTALL} --mode=644 ../machine/configs/* ${ROOT}/usr/local/share/${APPNAME}/
+${INSTALL} --mode=755 -T ${IMAGE}/../../../skolo/sys/power-cycle-rack4.json5 ${ROOT}/usr/local/share/${APPNAME}/power-cycle-rack4.json5
}
source ../bash/docker_build.sh
diff --git a/skolo/go/powercycle/edgeswitch.go b/skolo/go/powercycle/edgeswitch.go
index b4786c8..e8367e9 100644
--- a/skolo/go/powercycle/edgeswitch.go
+++ b/skolo/go/powercycle/edgeswitch.go
@@ -54,7 +54,7 @@
if c.Password != "" {
return c.Password
}
- return os.Getenv(powerCyclePasswordEnvVar)
+ return strings.TrimSpace(os.Getenv(powerCyclePasswordEnvVar))
}
// edgeSwitchClient implements the Client interface.
@@ -73,7 +73,7 @@
}
target := fmt.Sprintf("%s@%s", conf.User, conf.Address)
// The -T removes a warning SSH gives because we are not invoking it over TTY.
- runner := PasswordSSHCommandRunner(conf.getPassword(), "-T", target)
+ runner := PasswordSSHCommandRunner(conf.getPassword(), "-T", target, "-o", "StrictHostKeyChecking=no")
if connect {
out, _ := runner.ExecCmds(ctx, "help")
// When using sshpass, we always seem to get exit code 255 (from ssh) and any actual errors are
diff --git a/sk8s/images/podwatcher/power-cycle-rack4.json5 b/skolo/sys/power-cycle-rack4.json5
similarity index 100%
rename from sk8s/images/podwatcher/power-cycle-rack4.json5
rename to skolo/sys/power-cycle-rack4.json5