machine/go/test_machine_monitor/foundrybotcustodian/foundrybotcustodian.go - buildbot.git - Git at Google

 // Package foundrybotcustodian starts Foundry Bot to handle RBE requests and brings it up and down
 // in loose synchrony with Maintenance Mode.
 package foundrybotcustodian

 import (
 	"context"
 	"errors"
 	"io/fs"
 	"os"
 	"os/exec"

 	"go.skia.org/infra/go/executil"
 	"go.skia.org/infra/go/metrics2"
 	"go.skia.org/infra/go/recentschannel"
 	"go.skia.org/infra/go/skerr"
 	"go.skia.org/infra/go/sklog"
 )

 // Start spawns a goroutine that forever brings Foundry Bot up or down in accordance with the
 // wishes of machineserver: down during maintenance mode and up otherwise. These wishes are sent on
 // wantUpChannel as booleans: true for up and false for down. The caller should send a steady
 // heartbeat of these, and we fulfill them as promptly as possible, though allowing Foundry Bot to
 // complete its current job before taking it down. We also restart Foundry Bot if it falls down on
 // its own.
 //
 // ctx bounds the lifetime of the custodian goroutine, which returns once ctx is done. It is also
 //     handed to every Foundry Bot process we launch.
 // botPath is the absolute path to a copy of Foundry Bot.
 // instance is the GCP instance under which the RBE jobs run. Its project must contain a Remote
 //     Build Execution API endpoint under APIs & Services.
 //
 // Start looks for likely error conditions and returns them before starting the goroutine: a
 // missing botPath, or any other failure to stat() it. It returns nil once the goroutine has been
 // launched.
 func Start(ctx context.Context, botPath string, instance string, wantUpChannel *recentschannel.Ch[bool]) error {
 	// Check as much as we can before spinning off the goroutine. The error is scoped to this
 	// statement so the goroutine below cannot accidentally capture a stale (always-nil) value.
 	if _, err := os.Stat(botPath); err != nil {
 		if errors.Is(err, fs.ErrNotExist) {
 			return skerr.Wrapf(err, "Foundry Bot not found at %s", botPath)
 		}
 		return skerr.Wrapf(err, "failed to stat() %s", botPath)
 	}

 	// Start custodianship loop.
 	go func() {
 		// exits is a channel that receives a message every time the Foundry Bot process ends for any
 		// reason. The value is ignored, but it serves as a synchronizer and a provocation to bring the
 		// process back up. It's buffered because there's no advantage in waitForProcess() delaying
 		// exiting until the custodian receives the value.
 		exits := make(chan bool, 1)
 		var wantUp bool   // Initial value doesn't matter because we always write before read.
 		var cmd *exec.Cmd // cmd is nil iff process is down.

 		// Set up metrics. The four status metrics share a name and differ only in their "status"
 		// tag; after each loop iteration exactly one of them is 1 and the rest are 0.
 		const statusMetric = "machine_tmm_foundry_bot_status"
 		timeSinceProcessStarted := metrics2.NewLiveness("liveness_machine_tmm_foundry_bot", nil)
 		runningMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "running"})
 		maintenanceMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "maintenance"})
 		failedToStartMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_start"})
 		failedToStopMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_stop"})

 		for {
 			select {
 			case wantUp = <-wantUpChannel.Recv():
 				// A polling request to machineserver has returned.
 				switch {
 				case wantUp && cmd == nil:
 					cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
 					// If starting the process failed, we'll have another try at the next heartbeat.
 				case !wantUp && cmd != nil:
 					// Blocks until the process has exited, unless the interrupt could not be sent.
 					cmd = stopProcess(cmd, exits)
 				}
 			case <-exits:
 				// Foundry Bot exited on its own. It's not supposed to do that.
 				cmd = nil
 				// Start it up again if we like, without waiting for next heartbeat.
 				if wantUp {
 					cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
 					// If starting the process failed, we'll have another try at the next heartbeat.
 				}
 			case <-ctx.Done():
 				// For now, this is an error case, because nobody is canceling the context yet.
 				// Report the context's own error: that is the actual reason we are stopping.
 				sklog.Infof("Foundry Bot custodian stopped: %s", ctx.Err())
 				return
 			}

 			// Update metrics to reflect the desired state versus the actual state.
 			runningMetric.Update(metrics2.BoolToInt(wantUp && cmd != nil))
 			maintenanceMetric.Update(metrics2.BoolToInt(!wantUp && cmd == nil))
 			failedToStartMetric.Update(metrics2.BoolToInt(wantUp && cmd == nil))
 			failedToStopMetric.Update(metrics2.BoolToInt(!wantUp && cmd != nil))
 		}
 	}()
 	return nil
 }

 // startProcess launches the Foundry Bot binary at botPath against the given RBE instance. On a
 // successful launch it also spawns waitForProcess in its own goroutine, which posts to the exits
 // channel once the process ends so a listener can restart it if it likes.
 //
 // timeSinceProcessStarted is reset on every launch attempt, whether or not the launch succeeds.
 //
 // Returns the Cmd for the running process, or nil if it could not be brought up. The command is
 // created with ctx, so the process is killed when the context is cancelled.
 func startProcess(ctx context.Context, botPath string, instance string, exits chan bool, timeSinceProcessStarted metrics2.Liveness) *exec.Cmd {
 	// rbeServiceAddress is the FQDN and port of the Foundry service to which the Foundry Bot should
 	// connect to receive tasks.
 	const rbeServiceAddress = "remotebuildexecution.googleapis.com:443"

 	bot := executil.CommandContext(
 		ctx,
 		botPath,
 		"-service_address="+rbeServiceAddress,
 		"-instance_name="+instance,
 		"session",
 		"-sandbox=none",
 		// 6h timeout after sending a SIGINT to foundry_bot because our longest job is about 3.75h.
 		// See https://perf.skia.org/e/?queries=sub_result%3Dtask_step_s.
 		"-stop_time=6h")
 	sklog.Infof("Starting %s", bot.String())
 	timeSinceProcessStarted.Reset()

 	if startErr := bot.Start(); startErr != nil {
 		sklog.Errorf("Starting Foundry Bot process failed: %s", startErr)

 		// The caller gets nil and will have another try at the next aspiration polling.
 		return nil
 	}

 	// From here on the PID is set in the Cmd, so it's valid to send signals to it.
 	//
 	// Wait for process exit on a separate goroutine so the caller can keep listening for
 	// transitions into maintenance mode.
 	go waitForProcess(bot, exits)
 	return bot
 }

 // waitForProcess waits for the Foundry Bot process to exit (whether by signal or normal completion)
 // and then sends a value on exits so a listener can restart the process if desired. The value sent
 // is always false and carries no meaning; only the arrival of the message matters. Intended to be
 // spawned as a new goroutine.
 //
 // cmd must already have been started. The outcome of the wait is logged rather than returned.
 func waitForProcess(cmd *exec.Cmd, exits chan bool) {
 	if err := cmd.Wait(); err != nil {
 		// If the context was canceled, err is an os/exec.ExitError with .Success() == false. It can
 		// also be a plain old error if something more exotic goes wrong. Either way, record it so
 		// the reason for the exit is visible in the logs.
 		sklog.Infof("Foundry Bot exited: %s", err)
 	} else {
 		sklog.Errorf("Foundry Bot exited without an error, which is unexpected in production.")
 	}
 	exits <- false
 }

 // stopProcess asks the Foundry Bot process to shut down gracefully by sending it an interrupt
 // signal and, if the signal was delivered, blocks until a message arrives on exits.
 //
 // Returns nil once the process has exited. If the signal could not be sent, the process may still
 // be up, so the passed-in Cmd is returned unchanged.
 func stopProcess(cmd *exec.Cmd, exits chan bool) *exec.Cmd {
 	// TODO(erikrose): On Windows, send a CTRL_CLOSE_EVENT, as that gives the
 	// program a chance to exit gracefully. First, confirm Foundry Bot listens to that.
 	if sigErr := cmd.Process.Signal(os.Interrupt); sigErr != nil {
 		// Signal()'s error conditions are not specified in the Golang docs, but the UNIX
 		// implementation's source suggests that an error is returned if the process has already
 		// exited. In that case a message is (or soon will be) waiting on exits, and the "<-exits"
 		// case of the custodian loop in Start() will clear its cmd.
 		sklog.Warningf("Sending interrupt signal to Foundry Bot failed: %s", sigErr)
 		return cmd
 	}

 	// The interrupt was sent nicely, so wait for the process to exit.
 	<-exits
 	return nil
 }
	// Package foundrybotcustodian starts Foundry Bot to handle RBE requests and brings it up and down
	// in loose synchrony with Maintenance Mode.
	package foundrybotcustodian

	import (
	"context"
	"errors"
	"io/fs"
	"os"
	"os/exec"

	"go.skia.org/infra/go/executil"
	"go.skia.org/infra/go/metrics2"
	"go.skia.org/infra/go/recentschannel"
	"go.skia.org/infra/go/skerr"
	"go.skia.org/infra/go/sklog"
	)

	// Start spawns a goroutine that forever brings Foundry Bot up or down in accordance with the
	// wishes of machineserver: down during maintenance mode and up otherwise. These wishes are sent on
	// wantUpChannel as booleans: true for up and false for down. The caller should send a steady
	// heartbeat of these, and we fulfill them as promptly as possible, though allowing Foundry Bot to
	// complete its current job before taking it down. We also restart Foundry Bot if it falls down on
	// its own.
	//
	// ctx bounds the lifetime of the custodian goroutine, which returns once ctx is done. It is also
	// handed to every Foundry Bot process we launch.
	// botPath is the absolute path to a copy of Foundry Bot.
	// instance is the GCP instance under which the RBE jobs run. Its project must contain a Remote
	// Build Execution API endpoint under APIs & Services.
	//
	// Start looks for likely error conditions and returns them before starting the goroutine: a
	// missing botPath, or any other failure to stat() it. It returns nil once the goroutine has been
	// launched.
	func Start(ctx context.Context, botPath string, instance string, wantUpChannel *recentschannel.Ch[bool]) error {
		// Check as much as we can before spinning off the goroutine. The error is scoped to this
		// statement so the goroutine below cannot accidentally capture a stale (always-nil) value.
		if _, err := os.Stat(botPath); err != nil {
			if errors.Is(err, fs.ErrNotExist) {
				return skerr.Wrapf(err, "Foundry Bot not found at %s", botPath)
			}
			return skerr.Wrapf(err, "failed to stat() %s", botPath)
		}

		// Start custodianship loop.
		go func() {
			// exits is a channel that receives a message every time the Foundry Bot process ends for any
			// reason. The value is ignored, but it serves as a synchronizer and a provocation to bring the
			// process back up. It's buffered because there's no advantage in waitForProcess() delaying
			// exiting until the custodian receives the value.
			exits := make(chan bool, 1)
			var wantUp bool   // Initial value doesn't matter because we always write before read.
			var cmd *exec.Cmd // cmd is nil iff process is down.

			// Set up metrics. The four status metrics share a name and differ only in their "status"
			// tag; after each loop iteration exactly one of them is 1 and the rest are 0.
			const statusMetric = "machine_tmm_foundry_bot_status"
			timeSinceProcessStarted := metrics2.NewLiveness("liveness_machine_tmm_foundry_bot", nil)
			runningMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "running"})
			maintenanceMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "maintenance"})
			failedToStartMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_start"})
			failedToStopMetric := metrics2.GetInt64Metric(statusMetric, map[string]string{"status": "failed_to_stop"})

			for {
				select {
				case wantUp = <-wantUpChannel.Recv():
					// A polling request to machineserver has returned.
					switch {
					case wantUp && cmd == nil:
						cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
						// If starting the process failed, we'll have another try at the next heartbeat.
					case !wantUp && cmd != nil:
						// Blocks until the process has exited, unless the interrupt could not be sent.
						cmd = stopProcess(cmd, exits)
					}
				case <-exits:
					// Foundry Bot exited on its own. It's not supposed to do that.
					cmd = nil
					// Start it up again if we like, without waiting for next heartbeat.
					if wantUp {
						cmd = startProcess(ctx, botPath, instance, exits, timeSinceProcessStarted)
						// If starting the process failed, we'll have another try at the next heartbeat.
					}
				case <-ctx.Done():
					// For now, this is an error case, because nobody is canceling the context yet.
					// Report the context's own error: that is the actual reason we are stopping.
					sklog.Infof("Foundry Bot custodian stopped: %s", ctx.Err())
					return
				}

				// Update metrics to reflect the desired state versus the actual state.
				runningMetric.Update(metrics2.BoolToInt(wantUp && cmd != nil))
				maintenanceMetric.Update(metrics2.BoolToInt(!wantUp && cmd == nil))
				failedToStartMetric.Update(metrics2.BoolToInt(wantUp && cmd == nil))
				failedToStopMetric.Update(metrics2.BoolToInt(!wantUp && cmd != nil))
			}
		}()
		return nil
	}

	// startProcess launches the Foundry Bot binary at botPath against the given RBE instance. On a
	// successful launch it also spawns waitForProcess in its own goroutine, which posts to the exits
	// channel once the process ends so a listener can restart it if it likes.
	//
	// timeSinceProcessStarted is reset on every launch attempt, whether or not the launch succeeds.
	//
	// Returns the Cmd for the running process, or nil if it could not be brought up. The command is
	// created with ctx, so the process is killed when the context is cancelled.
	func startProcess(ctx context.Context, botPath string, instance string, exits chan bool, timeSinceProcessStarted metrics2.Liveness) *exec.Cmd {
		// rbeServiceAddress is the FQDN and port of the Foundry service to which the Foundry Bot should
		// connect to receive tasks.
		const rbeServiceAddress = "remotebuildexecution.googleapis.com:443"

		bot := executil.CommandContext(
			ctx,
			botPath,
			"-service_address="+rbeServiceAddress,
			"-instance_name="+instance,
			"session",
			"-sandbox=none",
			// 6h timeout after sending a SIGINT to foundry_bot because our longest job is about 3.75h.
			// See https://perf.skia.org/e/?queries=sub_result%3Dtask_step_s.
			"-stop_time=6h")
		sklog.Infof("Starting %s", bot.String())
		timeSinceProcessStarted.Reset()

		if startErr := bot.Start(); startErr != nil {
			sklog.Errorf("Starting Foundry Bot process failed: %s", startErr)

			// The caller gets nil and will have another try at the next aspiration polling.
			return nil
		}

		// From here on the PID is set in the Cmd, so it's valid to send signals to it.
		//
		// Wait for process exit on a separate goroutine so the caller can keep listening for
		// transitions into maintenance mode.
		go waitForProcess(bot, exits)
		return bot
	}

	// waitForProcess waits for the Foundry Bot process to exit (whether by signal or normal completion)
	// and then sends a value on exits so a listener can restart the process if desired. The value sent
	// is always false and carries no meaning; only the arrival of the message matters. Intended to be
	// spawned as a new goroutine.
	//
	// cmd must already have been started. The outcome of the wait is logged rather than returned.
	func waitForProcess(cmd *exec.Cmd, exits chan bool) {
		if err := cmd.Wait(); err != nil {
			// If the context was canceled, err is an os/exec.ExitError with .Success() == false. It can
			// also be a plain old error if something more exotic goes wrong. Either way, record it so
			// the reason for the exit is visible in the logs.
			sklog.Infof("Foundry Bot exited: %s", err)
		} else {
			sklog.Errorf("Foundry Bot exited without an error, which is unexpected in production.")
		}
		exits <- false
	}

	// stopProcess attempts to gracefully stop the Foundry Bot process and waits for it to exit. Returns
	// the passed-in Cmd if the process is still up afterward (which it can be due to errors), nil
	// otherwise.
	func stopProcess(cmd exec.Cmd, exits chan bool) exec.Cmd {
	// TODO(erikrose): On Windows, send a CTRL_CLOSE_EVENT, as that gives the
	// program a chance to exit gracefully. First, confirm Foundry Bot listens to that.
	err := cmd.Process.Signal(os.Interrupt)
	if err == nil {
	// If interrupt was sent nicely, wait for process to exit. Signal()'s
	// error conditions are not specified in the Golang docs, but the UNIX implementation's
	// source suggests that, if the process has already exited, an error will be returned.
	<-exits
	return nil
	} else {
	// The process may already have exited on its own. We'll update isUp properly in the
	// "<- exits" case of StartCustodian().
	sklog.Warningf("Sending interrupt signal to Foundry Bot failed: %s", err)
	return cmd
	}
	}