blob: ecdc31795c058446cfde85a57ed620a27e4ae3c4 [file] [log] [blame]
// Package server is the core functionality of test_machine_monitor.
package server
import (
"encoding/json"
"net/http"
"os"
"time"
"github.com/gorilla/mux"
"go.skia.org/infra/go/httputils"
"go.skia.org/infra/go/metrics2"
"go.skia.org/infra/go/sklog"
"go.skia.org/infra/machine/go/test_machine_monitor/machine"
)
const (
serverReadTimeout = 5 * time.Minute
serverWriteTimeout = 5 * time.Minute
OnBeforeTaskPath = "/on_before_task"
OnAfterTaskPath = "/on_after_task"
)
// Server is the core functionality of test_machine_monitor.
type Server struct {
r *mux.Router
machine *machine.Machine
triggerInterrogationCh chan<- bool
getStateRequests metrics2.Counter
getStateRequestsSuccess metrics2.Counter
getSettingsRequests metrics2.Counter
getSettingsRequestsSuccess metrics2.Counter
getDimensionsRequests metrics2.Counter
getDimensionsRequestsSuccess metrics2.Counter
onBeforeTaskSuccess metrics2.Counter
onAfterTaskSuccess metrics2.Counter
}
// New returns a new instance of Server.
func New(m *machine.Machine, triggerInterrogationCh chan<- bool) (*Server, error) {
r := mux.NewRouter()
ret := &Server{
r: r,
machine: m,
triggerInterrogationCh: triggerInterrogationCh,
getStateRequests: metrics2.GetCounter("bot_config_server_get_state_requests", map[string]string{"machine": m.MachineID}),
getStateRequestsSuccess: metrics2.GetCounter("bot_config_server_get_state_requests_success", map[string]string{"machine": m.MachineID}),
getSettingsRequests: metrics2.GetCounter("bot_config_server_get_settings_requests", map[string]string{"machine": m.MachineID}),
getSettingsRequestsSuccess: metrics2.GetCounter("bot_config_server_get_settings_requests_success", map[string]string{"machine": m.MachineID}),
getDimensionsRequests: metrics2.GetCounter("bot_config_server_get_dimensions_requests", map[string]string{"machine": m.MachineID}),
getDimensionsRequestsSuccess: metrics2.GetCounter("bot_config_server_get_dimensions_requests_success", map[string]string{"machine": m.MachineID}),
onBeforeTaskSuccess: metrics2.GetCounter("bot_config_server_on_before_task_requests_success", map[string]string{"machine": m.MachineID}),
onAfterTaskSuccess: metrics2.GetCounter("bot_config_server_on_after_task_requests_success", map[string]string{"machine": m.MachineID}),
}
r.HandleFunc("/get_state", ret.getState).Methods("POST")
r.HandleFunc("/get_settings", ret.getSettings).Methods("GET")
r.HandleFunc("/get_dimensions", ret.getDimensions).Methods("POST")
r.HandleFunc(OnBeforeTaskPath, ret.onBeforeTask).Methods("GET")
r.HandleFunc(OnAfterTaskPath, ret.onAfterTask).Methods("GET")
r.Use(
httputils.HealthzAndHTTPS,
httputils.LoggingGzipRequestResponse,
)
return ret, nil
}
// getState implements get_state in botconfig.py.
//
// The input is a JSON dictionary via POST that is returned from os_utilities.get_state(), and will
// emit an updated JSON dictionary on return.
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+/refs/heads/main/appengine/swarming/swarming_bot/config/bot_config.py
func (s *Server) getState(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
s.getStateRequests.Inc(1)
dict := map[string]interface{}{}
if err := json.NewDecoder(r.Body).Decode(&dict); err != nil {
httputils.ReportError(w, err, "Failed to decode settings", http.StatusInternalServerError)
return
}
// Swarming doesn't get to decide a machine's maintenance or quarantined
// state. Always remove them since the presence of the key is the only thing
// that matters, the value is ignored.
delete(dict, "quarantined")
delete(dict, "maintenance")
// TODO(jcgregorio) Also gather/report device temp to Machine State.
dict["sk_rack"] = os.Getenv("MY_RACK_NAME")
if err := json.NewEncoder(w).Encode(dict); err != nil {
sklog.Errorf("Failed to encode state: %s", err)
return
}
s.getStateRequestsSuccess.Inc(1)
}
type isolated struct {
Size int64 `json:"size"`
}
type caches struct {
Isolated isolated `json:"isolated"`
}
type settings struct {
Caches caches `json:"caches"`
}
// getSettings implements get_settings for botconfig.py
//
// Will emit a JSON dictionary on GET with the settings.
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+/refs/heads/main/appengine/swarming/swarming_bot/config/bot_config.py
func (s *Server) getSettings(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "application/json")
s.getSettingsRequests.Inc(1)
dict := settings{
caches{
isolated{
Size: 8 * 1024 * 1024 * 1024,
},
},
}
if err := json.NewEncoder(w).Encode(dict); err != nil {
sklog.Errorf("Failed to encode settings: %s", err)
return
}
s.getSettingsRequestsSuccess.Inc(1)
}
// getDimensions implements get_dimensions in bot_config.py.
//
// The input is a JSON dictionary via POST that is returned from
// os_utilities.get_dimensions(). This command will emit an updated JSON
// dictionary in the response.
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+show/master/appengine/swarming/swarming_bot/config/botconfig.py
func (s *Server) getDimensions(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
s.getDimensionsRequests.Inc(1)
dim := map[string][]string{}
if err := json.NewDecoder(r.Body).Decode(&dim); err != nil {
httputils.ReportError(w, err, "Failed to decode JSON input.", http.StatusInternalServerError)
return
}
for key, values := range s.machine.DimensionsForSwarming() {
if len(values) != 0 {
dim[key] = values
} else {
sklog.Errorf("Found bad value: %v for key: %q", values, key)
}
}
if len(dim) == 0 {
sklog.Error("Empty dimensions found in get_dimensions.")
http.Error(w, "No valid Dimensions to report", http.StatusInternalServerError)
}
// Now that we do dimension detection, we can now just send the value we want.
if err := json.NewEncoder(w).Encode(dim); err != nil {
sklog.Errorf("Failed to encode JSON output: %s", err)
return
}
s.getDimensionsRequestsSuccess.Inc(1)
}
// onBeforeTask implements the on_before_task hook in bot_config.py.
//
// No other data is passed with this call.
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+/refs/heads/main/appengine/swarming/swarming_bot/config/bot_config.py
func (s *Server) onBeforeTask(http.ResponseWriter, *http.Request) {
s.machine.SetIsRunningSwarmingTask(true)
s.onBeforeTaskSuccess.Inc(1)
}
// onAfterTask implements the on_after_task hook in bot_config.py.
//
// No other data is passed with this call.
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+/refs/heads/main/appengine/swarming/swarming_bot/config/bot_config.py
func (s *Server) onAfterTask(_ http.ResponseWriter, r *http.Request) {
s.machine.SetIsRunningSwarmingTask(false)
s.onAfterTaskSuccess.Inc(1)
// Don't use r.Context() here as that seems to get cancelled by Swarming
// pretty quickly.
// Do this in a Go routine so as we can return from this HTTP handler
// quickly.
go func() {
defer metrics2.FuncTimer().Stop()
// Do the reboot first so that the device will be ready
// when doing the interrogation.
if err := s.machine.RebootDevice(r.Context()); err != nil {
sklog.Warningf("Failed to reboot device: %s", err)
}
s.triggerInterrogationCh <- true
}()
}
// Start the http server. This function never returns.
func (s *Server) Start(port string) error {
// Start serving.
sklog.Info("Ready to serve.")
server := &http.Server{
Addr: port,
Handler: s.r,
ReadTimeout: serverReadTimeout,
WriteTimeout: serverWriteTimeout,
MaxHeaderBytes: 1 << 20,
}
return server.ListenAndServe()
}