blob: 27543835fb6983a1c82c127645b4a15fb1fe2217 [file] [log] [blame]
package powercycle
import (
"context"
"fmt"
"os"
"strings"
"time"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
)
const (
	// powerCyclePasswordEnvVar names the environment variable that is consulted for the
	// password when the configuration does not carry one.
	powerCyclePasswordEnvVar = "POWERCYCLE_PASSWORD"

	// powerOffDelayEdgeSwitch is the default pause between switching a port off and
	// switching it on again.
	powerOffDelayEdgeSwitch = 5 * time.Second

	// edgeSwitchOn and edgeSwitchOff are the arguments passed to the "poe opmode" command
	// to enable, respectively disable, power on a port.
	edgeSwitchOn  = "auto"
	edgeSwitchOff = "shutdown"
)
// EdgeSwitchConfig contains configuration options for a single EdgeSwitch. Authentication is
// handled via a provided password. See go/skolo-powercycle-setup for more.
type EdgeSwitchConfig struct {
// Address is the IP address of the device, e.g. 192.168.1.33. Required.
Address string `json:"address"`
// User is the user name of the ssh connection. Required.
User string `json:"user"`
// Password is the password for User. When it is empty, the (whitespace-trimmed) value of the
// environment variable "POWERCYCLE_PASSWORD" is used instead.
Password string `json:"password"`
// DevPortMap maps each device id to the port of the switch that powers the device. Every
// port may be assigned to one device only.
DevPortMap map[DeviceID]int `json:"ports"`
}
// Validate checks that the configuration names a user and an address and that a password
// can be obtained, either from the configuration or from the environment. It returns an
// error describing the first missing piece, or nil if the configuration is complete.
func (c *EdgeSwitchConfig) Validate() error {
	switch {
	case c.User == "", c.Address == "":
		return skerr.Fmt("You must specify a user and ip address.")
	case c.getPassword() == "":
		return skerr.Fmt("You must specify the password.")
	default:
		return nil
	}
}
// getPassword returns the password to authenticate with. The Password field wins when it is
// set; otherwise the whitespace-trimmed content of the POWERCYCLE_PASSWORD environment
// variable is returned, which may be empty.
func (c *EdgeSwitchConfig) getPassword() string {
	password := c.Password
	if password == "" {
		password = strings.TrimSpace(os.Getenv(powerCyclePasswordEnvVar))
	}
	return password
}
// edgeSwitchClient implements the Client interface.
type edgeSwitchClient struct {
// conf is the configuration the client was created from.
conf *EdgeSwitchConfig
// portDevMap is the inverse of conf.DevPortMap: switch port -> device id.
portDevMap map[int]DeviceID
// devIDs holds the ids of all configured devices, ordered by sortIDs.
devIDs []DeviceID
// runner executes commands on the switch.
runner CommandRunner
}
// newEdgeSwitchController creates an edgeSwitchClient for the EdgeSwitch described by conf.
// If connect is true, it additionally runs a smoke test (the "help" command) on the device.
//
// A nil client is returned if conf is nil, fails validation or assigns one port to more than
// one device. If only the smoke test fails, the client is returned together with the error,
// so it can still be interrogated for the list of machines.
func newEdgeSwitchController(ctx context.Context, conf *EdgeSwitchConfig, connect bool) (*edgeSwitchClient, error) {
	if conf == nil {
		return nil, skerr.Fmt("edge switch configuration must not be nil")
	}
	if err := conf.Validate(); err != nil {
		return nil, skerr.Wrap(err)
	}

	target := fmt.Sprintf("%s@%s", conf.User, conf.Address)
	// The -T removes a warning SSH gives because we are not invoking it over TTY.
	// The -o StrictHostKeyChecking=no is added because pods don't have authorized_keys files.
	runner := PasswordSSHCommandRunner(conf.getPassword(), "-T", target, "-o", "StrictHostKeyChecking=no")
	ret := &edgeSwitchClient{
		conf:   conf,
		runner: runner,
	}

	// Build the dev-port mappings. Ensure each device and port occur only once.
	ret.portDevMap = make(map[int]DeviceID, len(conf.DevPortMap))
	for id, port := range conf.DevPortMap {
		if _, ok := ret.portDevMap[port]; ok {
			return nil, skerr.Fmt("Port '%d' specified more than once.", port)
		}
		ret.portDevMap[port] = id
		ret.devIDs = append(ret.devIDs, id)
	}
	sortIDs(ret.devIDs)

	if !connect {
		return ret, nil
	}

	// When using sshpass, we always seem to get exit code 255 (from ssh) and any actual errors
	// are in stderr. So the error is ignored on purpose and the returned output is checked for
	// evidence that things actually worked.
	out, _ := runner.ExecCmds(ctx, "help")
	sklog.Infof("help out: %q", out)
	if !strings.Contains(out, "HELP") {
		sklog.Info("Smoke test failed, this might be a UniFI device.")
		// UniFI devices are like the EdgeMax devices, but you need to telnet back to localhost
		// to get to the right command prompt.
		fallback := PasswordSSHCommandRunner(conf.getPassword(), "-T", target, "-o", "StrictHostKeyChecking=no", "-o", "RemoteCommand=\"telnet 127.0.0.1\"")
		out, _ = fallback.ExecCmds(ctx, "help")
		if !strings.Contains(out, "HELP") {
			return ret, skerr.Fmt("smoke test on edge switch %s failed; output: %s", target, out)
		}
		// The telnet-wrapped runner is the one that works, so the client has to keep using
		// it. Otherwise all later commands would go through the runner that just failed.
		ret.runner = fallback
	}
	sklog.Infof("connected successfully to edge switch %s", target)
	return ret, nil
}
// DeviceIDs implements the Client interface. It hands out the client's own slice of
// configured device ids (not a copy).
func (e *edgeSwitchClient) DeviceIDs() []DeviceID {
	ids := e.devIDs
	return ids
}
// PowerCycle implements the Client interface.
//
// It first tries to reboot the device via SSH. If that does not work, power on the device's
// switch port is turned off and, after a pause, turned on again. The pause lasts delayOverride
// if that is positive and powerOffDelayEdgeSwitch otherwise.
//
// An error is returned if id is not configured, if changing the port state fails, or if ctx
// is done before the pause has elapsed.
func (e *edgeSwitchClient) PowerCycle(ctx context.Context, id DeviceID, delayOverride time.Duration) error {
	delay := powerOffDelayEdgeSwitch
	if delayOverride > 0 {
		delay = delayOverride
	}

	port, known := e.conf.DevPortMap[id]
	if !known {
		return skerr.Fmt("Invalid id: %s", id)
	}

	if softPowerCycle(ctx, id) {
		sklog.Infof("Was able to powercycle %s via SSH", id)
		return nil
	}

	sklog.Infof("soft powercycle of %s failed, going to turn off POE port %d", id, port)
	if err := e.setPortValue(ctx, port, edgeSwitchOff); err != nil {
		return skerr.Wrapf(err, "turning port %d off", port)
	}
	sklog.Infof("Switched port %d off. Waiting for %s.", port, delay)

	// Wait with a timer instead of time.Sleep, so a cancelled or expired context does not
	// keep the caller blocked for the whole delay.
	timer := time.NewTimer(delay)
	defer timer.Stop()
	select {
	case <-timer.C:
	case <-ctx.Done():
		// The caller gave up while the device is without power. Still attempt to switch the
		// port back on (this may fail, given that ctx is done), so the device is not left
		// off deliberately. Either way the interruption is reported as an error.
		if err := e.setPortValue(ctx, port, edgeSwitchOn); err != nil {
			return skerr.Wrapf(err, "turning port %d back on after the wait was interrupted (%s)", port, ctx.Err())
		}
		return skerr.Wrapf(ctx.Err(), "wait interrupted; port %d was turned back on early", port)
	}

	if err := e.setPortValue(ctx, port, edgeSwitchOn); err != nil {
		return skerr.Wrapf(err, "turning port %d back on", port)
	}
	sklog.Infof("Switched port %d on.", port)
	return nil
}
// softPowerCycle tries to reboot the given machine over SSH (public key authentication) and
// reports whether the reboot command could be issued. A soft reboot is gentler on a bot than
// frequently cutting its power.
func softPowerCycle(ctx context.Context, machineName DeviceID) bool {
	// The bot id (e.g. "skia-rpi-001") is used as the host name, so it has to be resolvable,
	// for instance via DNS on the router or an /etc/hosts entry on this host.
	host := string(machineName)
	sshRunner := PublicKeySSHCommandRunner("-T", host)

	// Probe with a trivial command; if that fails the machine is not reachable via SSH.
	_, probeErr := sshRunner.ExecCmds(ctx, "time")
	if probeErr != nil {
		return false
	}

	// The error is deliberately ignored: the command never returns cleanly because the
	// machine goes down while it is running.
	rebootOut, _ := sshRunner.ExecCmds(ctx, "sudo /sbin/reboot -f")
	sklog.Infof("Soft reboot should have succeeded. See logs: %s", rebootOut)
	return true
}
// setPortValue sets the PoE operation mode of the given port to value (edgeSwitchOn or
// edgeSwitchOff) by running the necessary commands on the switch. It returns an error if the
// output of the switch does not mention value.
func (e *edgeSwitchClient) setPortValue(ctx context.Context, port int, value string) error {
	selectPort := fmt.Sprintf("interface 0/%d", port)
	setMode := fmt.Sprintf("poe opmode %s", value)

	// With sshpass the exit code always seems to be 255 (from ssh) and real errors end up in
	// stderr. The error is therefore ignored and the output is inspected instead.
	output, _ := e.runner.ExecCmds(ctx, "enable", "configure", selectPort, setMode)
	if strings.Contains(output, value) {
		sklog.Debugf("output while setting port %d to %s:\n%s\n", port, value, output)
		return nil
	}
	return skerr.Fmt("Error while setting port value - got output %s", output)
}