blob: 8ea6ed3ec3a67ee9ff987e6ad87110940f17ec4e [file] [log] [blame]
// Package foundrybotcustodian starts Foundry Bot to handle RBE requests and brings it up and down
// in loose synchrony with Maintenance Mode.
package foundrybotcustodian
import (
"context"
"errors"
"io/fs"
"os"
"os/exec"
"go.skia.org/infra/go/executil"
"go.skia.org/infra/go/metrics2"
"go.skia.org/infra/go/recentschannel"
"go.skia.org/infra/go/skerr"
"go.skia.org/infra/go/sklog"
)
// Start spawns a goroutine that forever brings Foundry Bot up or down in accordance with the
// wishes of machineserver: down during maintenance mode and up otherwise. These wishes are sent on
// wantUpChannel as booleans: true for up and false for down. The caller should send a steady
// heartbeat of these, and we fulfill them as promptly as possible, though allowing Foundry Bot to
// complete its current job before taking it down. We also restart Foundry Bot if it falls down on
// its own.
//
// botPath is the absolute path to a copy of Foundry Bot.
// instance is the GCP instance under which the RBE jobs run. Its project must contain a Remote
// Build Execution API endpoint under APIs & Services.
//
// Start looks for likely error conditions and returns them before starting the goroutine. It
// returns nil once the goroutine has been launched; the goroutine itself runs until ctx is done.
func Start(ctx context.Context, botPath string, instance string, wantUpChannel *recentschannel.Ch[bool]) error {
	// Check as much as we can before spinning off the goroutine.
	_, err := os.Stat(botPath)
	if errors.Is(err, fs.ErrNotExist) {
		return skerr.Wrapf(err, "Foundry Bot not found at %s", botPath)
	}
	if err != nil {
		return skerr.Wrapf(err, "failed to stat() %s", botPath)
	}

	// Start custodianship loop.
	go func() {
		// exits is a channel that receives a message every time the Foundry Bot process ends for any
		// reason. The value is ignored, but it serves as a synchronizer and a provocation to bring the
		// process back up. It's buffered because there's no advantage in waitForProcess() delaying
		// exiting until the custodian receives the value.
		exits := make(chan bool, 1)
		var wantUp bool   // Initial value doesn't matter because we always write before read.
		var cmd *exec.Cmd // cmd is nil iff process is down.

		// Set up metrics. The four status gauges share one metric name and are distinguished by their
		// "status" tag; each is derived from the (wantUp, cmd) pair at the bottom of the loop.
		const statusMetric = "machine_tmm_foundry_bot_status"
		timeSinceProcessStarted := metrics2.NewLiveness("liveness_machine_tmm_foundry_bot", nil)
		runningMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "running"})
		maintenanceMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "maintenance"})
		failedToStartMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_start"})
		failedToStopMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_stop"})

		for {
			select {
			case wantUp = <-wantUpChannel.Recv():
				// A polling request to machineserver has returned.
				switch {
				case wantUp && cmd == nil:
					cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
					// If starting the process failed, we'll have another try at the next heartbeat.
				case !wantUp && cmd != nil:
					// Blocks until the process has exited if the interrupt was delivered.
					cmd = stopProcess(cmd, exits)
				}
			case <-exits:
				// Foundry Bot exited on its own. It's not supposed to do that.
				cmd = nil
				// Start it up again if we like, without waiting for next heartbeat.
				if wantUp {
					cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
					// If starting the process failed, we'll have another try at the next heartbeat.
				}
			case <-ctx.Done():
				// For now, this is an error case, because nobody is canceling the context yet.
				// Report the context's own error: the os.Stat error captured from the enclosing
				// function is always nil by the time this goroutine runs.
				sklog.Infof("Foundry Bot custodian stopped: %s", ctx.Err())
				return
			}

			// Update metrics.
			runningMetric.Update(metrics2.BoolToInt(wantUp && cmd != nil))
			maintenanceMetric.Update(metrics2.BoolToInt(!wantUp && cmd == nil))
			failedToStartMetric.Update(metrics2.BoolToInt(wantUp && cmd == nil))
			failedToStopMetric.Update(metrics2.BoolToInt(!wantUp && cmd != nil))
		}
	}()
	return nil
}
// startProcess launches the Foundry Bot binary at botPath against the given RBE instance and
// returns the resulting Cmd, or nil if the process could not be started.
//
// On a successful start, a goroutine running waitForProcess is spawned; it posts a placeholder
// value on exits once the process ends so a listener can decide whether to bring it back up.
// timeSinceProcessStarted is reset on every start attempt, before the outcome is known. The
// command is built with ctx, so the process is killed when the context is cancelled.
func startProcess(ctx context.Context, botPath string, instance string, exits chan bool, timeSinceProcessStarted metrics2.Liveness) *exec.Cmd {
	// rbeServiceAddress is the FQDN and port of the Foundry service to which the Foundry Bot should
	// connect to receive tasks.
	const rbeServiceAddress = "remotebuildexecution.googleapis.com:443"

	serviceFlag := "-service_address=" + rbeServiceAddress
	instanceFlag := "-instance_name=" + instance
	// 6h timeout after sending a SIGINT to foundry_bot because our longest job is about 3.75h.
	// See https://perf.skia.org/e/?queries=sub_result%3Dtask_step_s.
	stopTimeFlag := "-stop_time=6h"

	bot := executil.CommandContext(ctx, botPath, serviceFlag, instanceFlag, "session", "-sandbox=none", stopTimeFlag)
	sklog.Infof("Starting %s", bot.String())
	timeSinceProcessStarted.Reset()

	if startErr := bot.Start(); startErr != nil {
		// Nothing to clean up; the caller retries at the next heartbeat.
		sklog.Errorf("Starting Foundry Bot process failed: %s", startErr)
		return nil
	}

	// The PID is now set in the Cmd, so it's valid to send signals to it. Wait for exit on a
	// separate goroutine so the caller can keep listening for transitions into maintenance mode.
	go waitForProcess(bot, exits)
	return bot
}
// waitForProcess blocks until the Foundry Bot process behind cmd has exited (whether by signal or
// normal completion) and then posts a value on exits so a listener can restart it if desired. The
// value sent carries no meaning; only its arrival matters. Intended to be spawned as a new
// goroutine.
func waitForProcess(cmd *exec.Cmd, exits chan bool) {
	// A nil result means a clean exit, which is the surprising case here. A non-nil result is
	// expected: per the original author's note, a cancelled context yields an os/exec.ExitError
	// with .Success() == false, and more exotic failures surface as a plain error.
	if waitErr := cmd.Wait(); waitErr == nil {
		sklog.Errorf("Foundry Bot exited without an error, which is unexpected in production.")
	}
	exits <- false
}
// stopProcess asks the Foundry Bot process to shut down gracefully by sending it an interrupt and,
// if the signal was delivered, blocks until its exit is reported on exits.
//
// Returns nil once the process has exited. If the signal could not be sent, the passed-in Cmd is
// returned unchanged, since the process may still be up.
func stopProcess(cmd *exec.Cmd, exits chan bool) *exec.Cmd {
	// TODO(erikrose): On Windows, send a CTRL_CLOSE_EVENT, as that gives the
	// program a chance to exit gracefully. First, confirm Foundry Bot listens to that.
	if sigErr := cmd.Process.Signal(os.Interrupt); sigErr != nil {
		// Signal()'s error conditions are not specified in the Golang docs, but the UNIX
		// implementation's source suggests that an error is returned if the process has already
		// exited. In that case the pending message on exits lets the custodian loop in Start()
		// clear its Cmd, so hand the Cmd back untouched here.
		sklog.Warningf("Sending interrupt signal to Foundry Bot failed: %s", sigErr)
		return cmd
	}

	// The interrupt was sent nicely; wait for the process to exit.
	<-exits
	return nil
}